diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java index d730d97b..fde76afa 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java @@ -100,8 +100,10 @@ public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT return Stream.of( //IPTCMetadataExtractor + testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quick.jpg"), testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quickIPTC-EXT.jpg"), testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quickIPTC-multi-creator.jpg"), + testFile(MIMETYPE_IMAGE_JPEG, "jpg", "testJPEG_IPTC_EXT.jpg"), testFile(MIMETYPE_IMAGE_GIF, "gif", "quick.gif"), testFile(MIMETYPE_IMAGE_PNG, "png", "quick.png"), testFile(MIMETYPE_IMAGE_RAW_RAF, "raf", "quick.raf"), diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.tiff_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.tiff_metadata.json index c83ead76..3cc9114d 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.tiff_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.tiff_metadata.json @@ -9,7 +9,7 @@ "{http://purl.org/dc/elements/1.1/}description" : "Gym class featuring a brown fox and lazy dog", "{http://purl.org/dc/elements/1.1/}creator" : "Nevin Nollop", "{http://www.alfresco.org/model/exif/1.0}orientation" : "1", - "{http://purl.org/dc/elements/1.1/}subject" : "Pangram, fox, dog", + "{http://purl.org/dc/elements/1.1/}subject" : [ "Pangram", "fox", "dog" ], "{http://www.alfresco.org/model/exif/1.0}resolutionUnit" : "Inch", 
"{http://www.alfresco.org/model/exif/1.0}yResolution" : "50.0", "{http://www.alfresco.org/model/exif/1.0}xResolution" : "50.0" diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-EXT.jpg_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-EXT.jpg_metadata.json index e7f493b6..4932606a 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-EXT.jpg_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-EXT.jpg_metadata.json @@ -7,7 +7,7 @@ "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID" : "RGAUSS", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName" : "United Kingdom", "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog", - "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "1885:03:14", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "1885-03-14", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "The Gym", "{http://purl.org/dc/elements/1.1/}description" : "Gym class featuring a brown fox and lazy dog", "{http://purl.org/dc/elements/1.1/}creator" : "Nevin Nollop", @@ -21,9 +21,9 @@ "{http://www.alfresco.org/model/exif/1.0}software" : "Adobe Photoshop CC (Macintosh)", "{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.2.0", "{http://www.alfresco.org/model/exif/1.0}orientation" : "1", - "{http://purl.org/dc/elements/1.1/}subject" : "fox, dog, lazy, jumping", + "{http://purl.org/dc/elements/1.1/}subject" : [ "fox", "dog", "lazy", "jumping" ], "{http://www.alfresco.org/model/exif/1.0}resolutionUnit" : "Inch", "{http://www.alfresco.org/model/exif/1.0}yResolution" : "1.0", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The Dog", "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName" : "Ray Gauss II" -} \ No newline at end of file +} diff --git 
a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-multi-creator.jpg_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-multi-creator.jpg_metadata.json index c77d7ca5..1e6edbde 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-multi-creator.jpg_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-multi-creator.jpg_metadata.json @@ -9,7 +9,7 @@ "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "The Gym", "{http://purl.org/dc/elements/1.1/}description" : "Gym class featuring a brown fox and lazy dog", - "{http://purl.org/dc/elements/1.1/}creator" : "John Smith, Jane Doe", + "{http://purl.org/dc/elements/1.1/}creator" : [ "John Smith", "Jane Doe" ], "{http://www.alfresco.org/model/exif/1.0}xResolution" : "1.0", "{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName" : "Derek Hulley", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity" : "Maidenhead", @@ -20,7 +20,7 @@ "{http://www.alfresco.org/model/exif/1.0}software" : "Adobe Photoshop CC (Macintosh)", "{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.2.0", "{http://www.alfresco.org/model/exif/1.0}orientation" : "1", - "{http://purl.org/dc/elements/1.1/}subject" : "fox, dog, lazy, jumping", + "{http://purl.org/dc/elements/1.1/}subject" : [ "fox", "dog", "lazy", "jumping" ], "{http://www.alfresco.org/model/exif/1.0}resolutionUnit" : "Inch", "{http://www.alfresco.org/model/exif/1.0}yResolution" : "1.0", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The Dog", diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg new file mode 100644 index 00000000..24598a0d 
Binary files /dev/null and b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg differ diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg_metadata.json new file mode 100644 index 00000000..a2adeab1 --- /dev/null +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg_metadata.json @@ -0,0 +1,166 @@ +{ + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCity": "Atlanta", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrExtadr": "1234 Some Road", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrPcode": "30339", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrRegion": "GA", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiEmailWork": "info@alfresco.com.other@example.com", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiTelWork": "555-1234.555-4321", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiUrlWork": "http://alfresco.com.http://example.com", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CountryCode": "US", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}IntellectualGenre": "intellectual genre", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Location": "Rock Creek Park", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Scene": [ + "iptc scene 1", + "iptc scene 2" + ], + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}SubjectCode": [ + "iptc subject code 1", + "iptc subject code 2" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCopyrightNotice": "Ray Gauss II", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCreator": [ + "Mother Nature", + "Man", + "Mother Nature" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated": [ + "1890-01-01", + "1901-02-01" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSource": "National Park Service", + 
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSourceInvNo": [ + "123456", + "654321" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle": [ + "Rock Creek Stream Bank", + "Pollution", + "Some Tree" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AddlModelInfo": "rocky 1 and rocky 2 are big", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourceType": "http://cv.iptc.org/newscodes/digitalsourcetype/digitalCapture", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}Event": "Photo Bike Tour", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCity": "Washington", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryCode": "US", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryName": "United States", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedProvinceState": "D.C.", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedSublocation": "Rock Creek Park", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedWorldRegion": "North America", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity": "Washington", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryCode": "US", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName": "United States", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownProvinceState": "D.C.", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation": [ + "Rock Creek Park Sub", + "Stream Section" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownWorldRegion": "North America", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailHeight": "3456", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailWidth": "5184", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}ModelAge": [ + "1000", + "1001" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageCode": [ + "ASPP", + "OTHER_ORG" + ], + 
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageName": [ + "ASPP", + "Other Org" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}PersonInImage": [ + "rocky 1", + "rocky 2" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegItemId": [ + "100-ABC-ABC-555", + "11223344", + "55667788" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegOrgId": [ + "PLUS", + "ORG 2" + ], + "{http://ns.adobe.com/photoshop/1.0/}AuthorsPosition": "DAM Architect", + "{http://ns.adobe.com/photoshop/1.0/}CaptionWriter": "Ray Gauss II", + "{http://ns.adobe.com/photoshop/1.0/}Category": "PrimaryCategory", + "{http://ns.adobe.com/photoshop/1.0/}City": "Washington", + "{http://ns.adobe.com/photoshop/1.0/}Country": "United States", + "{http://ns.adobe.com/photoshop/1.0/}Credit": "provider", + "{http://ns.adobe.com/photoshop/1.0/}DateCreated": "2011-08-31", + "{http://ns.adobe.com/photoshop/1.0/}Headline": "Rock Creek Park", + "{http://ns.adobe.com/photoshop/1.0/}Instructions": "instructions", + "{http://ns.adobe.com/photoshop/1.0/}Source": "source", + "{http://ns.adobe.com/photoshop/1.0/}State": "DC", + "{http://ns.adobe.com/photoshop/1.0/}SupplementalCategories": [ + "category1", + "category2" + ], + "{http://ns.adobe.com/photoshop/1.0/}TransmissionReference": "job identifier", + "{http://ns.adobe.com/xap/1.0/rights/}UsageTerms": "rights usage terms", + "{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerID": "RGAUSS", + "{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerName": [ + "Ray Gauss II", + "GG" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorID": "RGAUSS", + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName": [ + "Ray Gauss II", + "GG" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierID": "RGAUSS", + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierImageID": "supplier image ID", + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierName": "Ray Gauss II", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorEmail": "r@example.com", + 
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID": "RGAUSS", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName": [ + "Ray Gauss II", + "GG" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone1": "555-5555", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone2": "555-4444", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorURL": "http://rgauss.com", + "{http://ns.useplus.org/ldf/xmp/1.0/}MinorModelAgeDisclosure": "Age Unknown", + "{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseID": [ + "model release id 1", + "model release id 2" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseStatus": "Not Applicable", + "{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseID": [ + "prop release id 1", + "prop release id 2" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseStatus": "Not Applicable", + "{http://ns.useplus.org/ldf/xmp/1.0/}Version": "1.2.0", + "{http://purl.org/dc/elements/1.1/}creator": "Ray Gauss II", + "{http://purl.org/dc/elements/1.1/}description": "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.", + "{http://purl.org/dc/elements/1.1/}rights": "© Ray Gauss II", + "{http://purl.org/dc/elements/1.1/}subject": [ + "bank", + "park", + "rock creek", + "stream", + "washington" + ], + "{http://purl.org/dc/elements/1.1/}title": "Downstream", + "{http://www.alfresco.org/model/content/1.0}author": "Ray Gauss II", + "{http://www.alfresco.org/model/content/1.0}created": "2011-08-13T14:40:51", + "{http://www.alfresco.org/model/content/1.0}description": "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.", + "{http://www.alfresco.org/model/content/1.0}title": "Downstream", + "{http://www.alfresco.org/model/exif/1.0}dateTimeOriginal": "2011-08-13T14:40:51", + "{http://www.alfresco.org/model/exif/1.0}exposureTime": "0.0125", + "{http://www.alfresco.org/model/exif/1.0}fNumber": "10.0", + "{http://www.alfresco.org/model/exif/1.0}flash": "false", + 
"{http://www.alfresco.org/model/exif/1.0}focalLength": "50.0", + "{http://www.alfresco.org/model/exif/1.0}isoSpeedRatings": "640", + "{http://www.alfresco.org/model/exif/1.0}manufacturer": "Canon", + "{http://www.alfresco.org/model/exif/1.0}model": "Canon EOS 60D", + "{http://www.alfresco.org/model/exif/1.0}orientation": "1", + "{http://www.alfresco.org/model/exif/1.0}pixelXDimension": "103", + "{http://www.alfresco.org/model/exif/1.0}pixelYDimension": "69", + "{http://www.alfresco.org/model/exif/1.0}resolutionUnit": "Inch", + "{http://www.alfresco.org/model/exif/1.0}software": "Adobe Photoshop CS6 (Macintosh)", + "{http://www.alfresco.org/model/exif/1.0}xResolution": "72.0", + "{http://www.alfresco.org/model/exif/1.0}yResolution": "72.0" +} \ No newline at end of file diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractor.java index 7d716978..91597a23 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractor.java @@ -26,13 +26,15 @@ */ package org.alfresco.transformer.metadataExtractors; -import java.io.IOException; import java.io.Serializable; +import java.util.Arrays; import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import org.alfresco.transform.exceptions.TransformException; import org.alfresco.transformer.tika.parsers.ExifToolParser; -import org.apache.tika.exception.TikaException; +import org.apache.commons.lang3.StringUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; import org.slf4j.Logger; @@ -42,6 +44,12 @@ public class 
IPTCMetadataExtractor extends AbstractTikaMetadataExtractor { private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class); + + private static Set IPTC_DATE_KEYS = Set.of("XMP-photoshop:DateCreated", "XMP-iptcExt:ArtworkDateCreated"); + + private static final Pattern YEAR_IPTC = Pattern.compile("(\\d{4}[:|-]\\d{2}[:|-]\\d{2})"); + + private ExifToolParser parser; public IPTCMetadataExtractor() { @@ -49,13 +57,12 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor } @Override - protected Parser getParser() { - try { - return new ExifToolParser(); - } catch (IOException | TikaException e) { - logger.error(e.getMessage(), e); - throw new TransformException(500, "Error creating IPTC parser: " + e.getMessage()); - } + protected Parser getParser() + { + if (this.parser == null) { + this.parser = new ExifToolParser(); + } + return this.parser; } /** @@ -65,9 +72,87 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor */ @Override protected Map extractSpecific(Metadata metadata, Map properties, - Map headers) { - + Map headers) + { properties = new TikaAutoMetadataExtractor().extractSpecific(metadata, properties, headers); + ExifToolParser etParser = (ExifToolParser)this.getParser(); + if (etParser.getSeparator()!=null) + { + for (String key : properties.keySet()) + { + if (properties.get(key) instanceof String) + { + String value = (String) properties.get(key); + String separator = etParser.getSeparator(); + if (value.contains(separator)) + { + if (value.contains(String.format("\"%s\"",separator))) + { + separator = String.format("\"%s\"",separator); + } + String [] values = StringUtils.splitByWholeSeparator(value, separator); + // Change dateTime format. 
MM converted ':' to '-' + if (IPTC_DATE_KEYS.contains(key)){ + values = iptcToIso8601DateStrings(values); + } + putRawValue(key, (Serializable) Arrays.asList(values), properties); + } + else if (IPTC_DATE_KEYS.contains(key)) { + // Handle property with a single date string + putRawValue(key, (Serializable) iptcToIso8601DateString(value), properties); + } + } + } + } return properties; } + + /** + * Converts a date or date time strings into Iso8601 format

+ * + * @param dateStrings + * @return dateStrings in Iso8601 format + * @see #iptcToIso8601DateString + */ + protected String[] iptcToIso8601DateStrings(String[] dateStrings) + { + for (int i = 0; i < dateStrings.length; i++) + { + dateStrings[i] = iptcToIso8601DateString(dateStrings[i]); + } + return dateStrings; + } + + /** + * Converts a date or date time string into Iso8601 format

+ * Converts any ':' characters in the year portion of a date string to '-'.

+ * Expects the year in the format YYYY:MM:DD or YYYY-MM-DD

+ * Will add the correct delimiter, 'T', to any dateTime strings, where | stands for any single character other than 'T': + * YYYY:MM:DD|HH:mm:ss.... or YYYY-MM-DD|HH:mm:ss.... + *

+ * Examples:

    + *
  • "1919:10:16" will convert to "1919-10-16"
  • + *
  • "1901:02:01 00:00:00.000Z" will convert to "1901-02-01T00:00:00.000Z"
  • + *
  • "2001:02:01 16:15+00:00" will convert to "2001-02-01T16:15+00:00"
  • + *
  • "2021-06-11 05:36-01:00" will convert to "2021-06-11T05:36-01:00"
  • + *
+ * @param dateStr + * @return dateStr in Iso8601 format + */ + protected String iptcToIso8601DateString(String dateStr) + { + char timeSeparator = 'T'; + Matcher yearMatcher = YEAR_IPTC.matcher(dateStr); + if (yearMatcher.find()) + { + String year = yearMatcher.group(1); + dateStr = yearMatcher.replaceFirst(year.replaceAll(":", "-")); + if (dateStr.length()>year.length() && dateStr.charAt(year.length())!=timeSeparator) + { + dateStr = dateStr.replace(dateStr.charAt(year.length()), timeSeparator); + } + } + return dateStr; + } + } diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java index 1ffa420e..e43677a4 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java @@ -39,6 +39,7 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.Reader; import java.net.URL; +import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -58,27 +59,84 @@ import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.image.TiffParser; import org.apache.tika.parser.jpeg.JpegParser; import org.apache.tika.sax.XHTMLContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class ExifToolParser extends ExternalParser { + private static final Logger logger = LoggerFactory.getLogger(ExifToolParser.class); + private static final String EXIFTOOL_PARSER_CONFIG = "parsers/external/config/exiftool-parser.xml"; - public ExifToolParser() throws IOException, TikaException { + protected static final String 
DEFAULT_SEPARATOR = ", "; + protected static final String SEPARATOR_SETTING = "-sep"; + + private String separator; + + public ExifToolParser() { super(); - ExternalParser eParser = ExternalParsersFactory.create(getExternalParserConfigURL()).get(0); - this.setCommand(eParser.getCommand()); - this.setIgnoredLineConsumer(eParser.getIgnoredLineConsumer()); - this.setMetadataExtractionPatterns(eParser.getMetadataExtractionPatterns()); - this.setSupportedTypes(eParser.getSupportedTypes()); + try { + List eParsers = ExternalParsersFactory.create(getExternalParserConfigURL()); + // if ExifTool is not installed then no parsers are returned + if (eParsers.size() > 0) { + ExternalParser eParser = eParsers.get(0); + this.setCommand(eParser.getCommand()); + this.setIgnoredLineConsumer(eParser.getIgnoredLineConsumer()); + this.setMetadataExtractionPatterns(eParser.getMetadataExtractionPatterns()); + this.setSupportedTypes(eParser.getSupportedTypes()); + } else { + logger.error( + "Error creating ExifToolParser from config, ExifToolExtractions not enabled. Please check ExifTool is installed correctly."); + } + } catch (IOException | TikaException e) { + logger.error("Error creating ExifToolParser from config, ExifToolExtractions not enabled: ", e); + } } - + private URL getExternalParserConfigURL(){ ClassLoader classLoader = ExifToolParser.class.getClassLoader(); return classLoader.getResource(EXIFTOOL_PARSER_CONFIG); } + public void setSeparator(String sep) { + this.separator = sep; + } + + public String getSeparator() { + return this.separator; + } + + @Override + public void setCommand(String... 
command){ + super.setCommand(command); + if (command.length==1) { + setSeparator(findSeparator(command[0])); + } + else { + setSeparator(DEFAULT_SEPARATOR); + } + } + + protected String findSeparator(String command) { + if (command.contains(SEPARATOR_SETTING)) { + int start = command.indexOf(SEPARATOR_SETTING)+SEPARATOR_SETTING.length()+1; + String separator = DEFAULT_SEPARATOR; + if (command.charAt(start)=='\"') { + //get all chars up to the next \" + int end = command.indexOf("\"", start+1); + separator = command.substring(start+1, end); + } + else { + int end = command.indexOf(" ", start); + separator = command.substring(start, end); + } + return separator; + } + return DEFAULT_SEPARATOR; + } + /** * Adapted from {@link org.apache.tika.parser.external.ExternalParser} * due to errors attempting to {@link #extractMetadata} from the errorStream in original implementation.

@@ -95,7 +153,9 @@ public class ExifToolParser extends ExternalParser { TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp); - parse(tis, xhtml, metadata, tmp); + if (this.getSupportedTypes().contains(mediaType)) { + parse(tis, xhtml, metadata, tmp); + } switch (mediaType.getType()+"/"+mediaType.getSubtype()) { case MIMETYPE_IMAGE_JPEG: parseAdditional(new JpegParser(), tis, handler, metadata, context, mediaType); diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml index 7d77927c..076dfe54 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml @@ -5,7 +5,7 @@ exiftool -ver 126,127 - env FOO=${OUTPUT} exiftool -args -G1 ${INPUT} + env FOO=${OUTPUT} exiftool -args -G1 -sep "|||" ${INPUT} image/x-raw-hasselblad image/x-raw-sony diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractorTest.java b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractorTest.java new file mode 100644 index 00000000..fc35dbcf --- /dev/null +++ b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractorTest.java @@ -0,0 +1,48 @@ +/* + * #%L + * Alfresco Transform Core + * %% + * Copyright (C) 2005 - 2021 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * - + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. 
Otherwise, the software is + * provided under the following open source license terms: + * - + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * - + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * - + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + * #L% + */ +package org.alfresco.transformer.metadataExtractors; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; + +import org.junit.jupiter.api.Test; + +public class IPTCMetadataExtractorTest { + + IPTCMetadataExtractor extractor = new IPTCMetadataExtractor(); + + @Test + public void testIptcToIso8601DateStrings() { + String[] testStrings = { "1890:01:01", "1901:02:01 00:00:00.000Z", "1901-02-01 00:00:00.000Z", + "1901-02-01T00:00:00.000Z", "1901:02:01T00:00+00:00", "1901:02:01 00:00+00:00" }; + String[] expected = { "1890-01-01", "1901-02-01T00:00:00.000Z", "1901-02-01T00:00:00.000Z", + "1901-02-01T00:00:00.000Z", "1901-02-01T00:00+00:00", "1901-02-01T00:00+00:00" }; + + assertArrayEquals(expected, extractor.iptcToIso8601DateStrings(testStrings)); + + } + +} diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/tika/parsers/ExifToolParserTest.java b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/tika/parsers/ExifToolParserTest.java new file mode 100644 index 00000000..4303f255 --- /dev/null +++ b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/tika/parsers/ExifToolParserTest.java @@ -0,0 +1,59 @@ +/* 
+ * #%L + * Alfresco Transform Core + * %% + * Copyright (C) 2005 - 2021 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * - + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * - + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * - + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * - + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see <http://www.gnu.org/licenses/>. 
+ * #L% + */ +package org.alfresco.transformer.tika.parsers; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +public class ExifToolParserTest { + + ExifToolParser exifToolParser = new ExifToolParser(); + + @Test + public void testFindSeparator() { + + String testCommand = "env FOO=${OUTPUT} exiftool -args -G1 " + ExifToolParser.SEPARATOR_SETTING + + " \"|||\" ${INPUT}"; + String expected = "|||"; + String actual = exifToolParser.findSeparator(testCommand); + assertEquals(expected, actual); + + expected = "TESTWITHOUTQUOTES"; + testCommand = "nothing matters until the " + ExifToolParser.SEPARATOR_SETTING + " " + expected + + " now all this extra should be ignored"; + actual = exifToolParser.findSeparator(testCommand); + assertEquals(expected, actual); + + expected = "Test something bonkers 112!£$%^£$^"; + testCommand = ExifToolParser.SEPARATOR_SETTING + " \""+expected+"\""; + actual = exifToolParser.findSeparator(testCommand); + assertEquals(expected, actual); + + } + +}