From e11cbd51807568aa0fad956ce39a84f6f7deb849 Mon Sep 17 00:00:00 2001 From: David Edwards Date: Thu, 6 May 2021 08:58:42 +0100 Subject: [PATCH] ATS-892 Convert ExifTool separated strings into collections for ACS consumption (#397) ATS-911 Add regex pattern matching for date replacement --- .../transformer/TikaMetadataExtractsIT.java | 2 + .../test/resources/quick.tiff_metadata.json | 2 +- .../resources/quickIPTC-EXT.jpg_metadata.json | 6 +- .../quickIPTC-multi-creator.jpg_metadata.json | 4 +- .../src/test/resources/testJPEG_IPTC_EXT.jpg | Bin 0 -> 27421 bytes .../testJPEG_IPTC_EXT.jpg_metadata.json | 166 ++++++++++++++++++ .../IPTCMetadataExtractor.java | 109 ++++++++++-- .../tika/parsers/ExifToolParser.java | 76 +++++++- .../external/config/exiftool-parser.xml | 2 +- .../IPTCMetadataExtractorTest.java | 48 +++++ .../tika/parsers/ExifToolParserTest.java | 59 +++++++ 11 files changed, 447 insertions(+), 27 deletions(-) create mode 100644 alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg create mode 100644 alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg_metadata.json create mode 100644 alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractorTest.java create mode 100644 alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/tika/parsers/ExifToolParserTest.java diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java index d730d97b..fde76afa 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java +++ 
b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaMetadataExtractsIT.java @@ -100,8 +100,10 @@ public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT return Stream.of( //IPTCMetadataExtractor + testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quick.jpg"), testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quickIPTC-EXT.jpg"), testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quickIPTC-multi-creator.jpg"), + testFile(MIMETYPE_IMAGE_JPEG, "jpg", "testJPEG_IPTC_EXT.jpg"), testFile(MIMETYPE_IMAGE_GIF, "gif", "quick.gif"), testFile(MIMETYPE_IMAGE_PNG, "png", "quick.png"), testFile(MIMETYPE_IMAGE_RAW_RAF, "raf", "quick.raf"), diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.tiff_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.tiff_metadata.json index c83ead76..3cc9114d 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.tiff_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quick.tiff_metadata.json @@ -9,7 +9,7 @@ "{http://purl.org/dc/elements/1.1/}description" : "Gym class featuring a brown fox and lazy dog", "{http://purl.org/dc/elements/1.1/}creator" : "Nevin Nollop", "{http://www.alfresco.org/model/exif/1.0}orientation" : "1", - "{http://purl.org/dc/elements/1.1/}subject" : "Pangram, fox, dog", + "{http://purl.org/dc/elements/1.1/}subject" : [ "Pangram", "fox", "dog" ], "{http://www.alfresco.org/model/exif/1.0}resolutionUnit" : "Inch", "{http://www.alfresco.org/model/exif/1.0}yResolution" : "50.0", "{http://www.alfresco.org/model/exif/1.0}xResolution" : "50.0" diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-EXT.jpg_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-EXT.jpg_metadata.json index e7f493b6..4932606a 100644 --- 
a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-EXT.jpg_metadata.json +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-EXT.jpg_metadata.json @@ -7,7 +7,7 @@ "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID" : "RGAUSS", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName" : "United Kingdom", "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog", - "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "1885:03:14", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "1885-03-14", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "The Gym", "{http://purl.org/dc/elements/1.1/}description" : "Gym class featuring a brown fox and lazy dog", "{http://purl.org/dc/elements/1.1/}creator" : "Nevin Nollop", @@ -21,9 +21,9 @@ "{http://www.alfresco.org/model/exif/1.0}software" : "Adobe Photoshop CC (Macintosh)", "{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.2.0", "{http://www.alfresco.org/model/exif/1.0}orientation" : "1", - "{http://purl.org/dc/elements/1.1/}subject" : "fox, dog, lazy, jumping", + "{http://purl.org/dc/elements/1.1/}subject" : [ "fox", "dog", "lazy", "jumping" ], "{http://www.alfresco.org/model/exif/1.0}resolutionUnit" : "Inch", "{http://www.alfresco.org/model/exif/1.0}yResolution" : "1.0", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The Dog", "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName" : "Ray Gauss II" -} \ No newline at end of file +} diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-multi-creator.jpg_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-multi-creator.jpg_metadata.json index c77d7ca5..1e6edbde 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-multi-creator.jpg_metadata.json +++ 
b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/quickIPTC-multi-creator.jpg_metadata.json @@ -9,7 +9,7 @@ "{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "The Gym", "{http://purl.org/dc/elements/1.1/}description" : "Gym class featuring a brown fox and lazy dog", - "{http://purl.org/dc/elements/1.1/}creator" : "John Smith, Jane Doe", + "{http://purl.org/dc/elements/1.1/}creator" : [ "John Smith", "Jane Doe" ], "{http://www.alfresco.org/model/exif/1.0}xResolution" : "1.0", "{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName" : "Derek Hulley", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity" : "Maidenhead", @@ -20,7 +20,7 @@ "{http://www.alfresco.org/model/exif/1.0}software" : "Adobe Photoshop CC (Macintosh)", "{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.2.0", "{http://www.alfresco.org/model/exif/1.0}orientation" : "1", - "{http://purl.org/dc/elements/1.1/}subject" : "fox, dog, lazy, jumping", + "{http://purl.org/dc/elements/1.1/}subject" : [ "fox", "dog", "lazy", "jumping" ], "{http://www.alfresco.org/model/exif/1.0}resolutionUnit" : "Inch", "{http://www.alfresco.org/model/exif/1.0}yResolution" : "1.0", "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The Dog", diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg new file mode 100644 index 0000000000000000000000000000000000000000..24598a0dd2466324dfb7960bc65cd611c38c7b3a GIT binary patch literal 27421 zcmeHw2Ut_jv*<}7p%bczh!H`Qn$SV%T|kg7q8O3@ku*~%Dk5O-qGE4|qSzZ2R1^y~ zR761q1T5GED+&T{&q*O5`upAgefNIf`<@4L&Ya!Z+1c6IHamOldDzoT80C}BNQ9u^ zV8|4LAbCg~1wj}9p`aviLy2J@0HYBYj(d#4iNnz#TonR$hz5R5C^ZDeflzN00fAKk zJPxHH#s|bv+F}?`Lg_(rNL~;s1~o(qPeqNAp-)EW5CL{F${K;S0K5=227y%oyg6q* 
z0uundcL4*!ifNvOAnaUFCh+G$7DO>@a zX9;;x*c`U#PV@;0C)$&|AxQD(90)QYNS+=vP6C}63RC4LbGSsWaC_pYU<#GNh66@J z)+DlxGs)hWOeT`8ovm%1?QEbhN(Rx7BINUl0Rd3?USgkjP*ljISMHenUb$m7_mR6) z)|ld6S!0gM$~tGGqz*7=a<)j}@`Z-Ib+*t%1fxN@RzeUS(m z2NbAK)HVP`0As{BFf3^&Ff7tzB~bRUh1Lj7pr=^BLR*ASP%3Ovp&bIlIB77g0W}KJ zCxR%@-Xu?0o=_o~**XIW{V@_yumL}L@Us&? z!2_m_x&of<;Af4zqh?XTQyr31@dHmMlC1;D+S-ge`&d?A(I`osVV|O4#; zdKy3>@jaJOZh%WU6mkKE;R^$LWvm;=8sgqole2g+>zd^ zBWOP5I|cTG?nOSOe>yhzQnm2eXJ3;wh4YTDy?p=qmt7~WJb3XnDlv6_{_c{iH7(y9 z{GyZ67OX2eSyB74T^&NB0k>F$Cp=Eh4q;(9Spy3=m~N;kXPv!D3+CWrSXG;iQ9?5> zC(kQfo65KSXpDmyz>l;mzXzDuIf&*HX)mBZm9p@A4orGk=y?n&V~{-6AvfsdmdtI< zX4LpEfs4jwT|9lht2DBhm16p&`HkyH<4S9_(v&Q_vTf(W?dt+mR#7Kg;)s4NT}Dx~ zz!JM#gsJ!v+O{X`)}1|2R_NjI#9Q-Qxw_?7Q95|*$)`HYyjyyp-1gatAoja=ERPg? zW$P;Xs#E@kf`*eR=iVQiCV#%d)^=s>Q6+t~!#QbAjo00e)Es%>HoP)Jb>zO3s!N~G z-R^8gE9-3aE?9idK6q_9F)bE=`WQV}fRX&LZFLp450GoO|%)5Z%dce)aAqAw`{?tM&-( zB4YIQywRU%9v#8_$?x2KRMtK0cpP)j$I`-Lp^ZUf<ssB9m zVz?$@T=HY{S@_9|s0~GhwHlK(2~#S!wicZ;X4jVv&ABn6IyoCUe!4t0i&FNLVKV)C z>bY$_cUy*Tm`IJ)Q@HOwg!*Gj>x@LlQ$ru;?pbx{Q`XU9iesH1Hn#FW{Y%2stci_} zqgMoHB<;Vc*HO18w=gTR(Z|<)$PwLp<93(3Si7%OQmg8L96oRPvH0dI;rHgHhmR_+ z&vv`c&l=NSlWe7uviI&? zsVXa08Z_OlnSbZ>@~)O+4Xu$!RVQ{0Z@{}7%~z}JaND?PFqAyH@m0h4r3RbsA9;V^ z%8xG{k$P8~lqRxf@2cC}u(dObY@8eZ>8jl_60?b<_T=MzxAw4*DLqh6)iY8s``r7( zgAXeVp>GcXjhd0HZ|0BQmpSq{!R4so>~$w@^+4pPBKdXpSM!D#;T>7TE$3^^R&SeR ze;>2x?%-v+GY>VKc$e1YNh@GqjP+L9A!6m8SJ z?7@k5-^QdJdtLU^a29oU(IO(FcqF-dlR?-1_Gx#Uv00PSKJ-8nCM7-GGv}rk=JXJS zMP0SCrxmhaKdnVyTwcpc*Kzw&)U8xErNU`j3p3VocWK-Cnx<(7YV4K?m#Hwev0EqH zPHg0-W~$8?**V=}#L>XE4&jw~KYUg&o_y1Jzkk6tcEkC0r{pv5l*pOR?~2VJTa8%E zf|k;{FH;)Z;=dc-IJ!E@?7;aq$-53?-laco?t$=h)=d4@=Q9l!)(;6tc^or@pEhSo z!=xc6-=~~A(0#^8uQ*m=sAf%|*P@c*ef8eYh8EE>M|Q7BeBXGR)A%m@^}{abrN|L%LTVAnXY?n#&nYZ?t4(!YoKku}>jTlvN`(RV@x+Ax1ry9?9U02G!pGv7-w)&0! 
z<~UAh=LY&MRZ<#@C2XDjT;51M0DrMYxaPscqc;vvH^)6YQTo(j=Ij+!S=Eilx-F)@ zaC!8={;cyd_Tko<@6(yg5h>^V9Fl%ca@~0LN%3UiiKjH>`44clJrS7de=3pi$1)ccKqFRZ^euJm-W@} zA1kzv;k>llb}gzQc>I`+7-9}>8Aam;`}iW|hyDw4Vw>EY4%D4n?v08Xzt5X?kF~zN z{&5Q9XvLfuLx+I6l9RLfTU$yWUXG@0+;4Y&=cI?J`8oO?KU+%K-)6e;XPjHKWYu`T z10LF^(3ndBTzq?6D>be?Yq-Huhx;#e@aJjkB6ZZo|Bf_=J|OoYLEM zkOiziah-$iut~tJ?@^O|!%9i(&BA@BZ8EW+HF8V;Y-(2-4L38Vd z3-PP2y;2=A-zN6PI~ASdr;a;a_|WkBL@B6)ZRg&+eOqa(7w?mL@6pJ^H5=5wY<~02 zbPKjzjcUUjyV^4LIc2j)`1B3VGsZk@@~e9M(y4RO z&v`3ya~DhruDCbzxM9J2{I_#%Gq-dJpU?z)?`A(`T4v;>eQ^7-v3PlIq*dd&rZ;xb zinmUdhS>^ZYqVo5uBlp_zPQ|A$BAdv79(CeuehD|wbuQ1QFY4WPm?BM>Qe}x-0ME1 zRi~YNxGzaimqM{%e=D7sZ1S`9PE6~8&m#$g{qL=cdi7lBQ0iQ|=f*W%N~J^8Go#~# z%vBw}omo!C?zZ{ryK^@LSaFND^;B=kN?Lugx?#jK(`#Goe`Gl*RphxJQae%YwqDRW zA|f#;h5W?nMLN%alxu5zQJr9)#g}&ZwjWm8?gfP0ZQS&3>&zagC7|x5an=5;BeUA_ z#?4+(J@Mu7w)H)brQ^Ny__2j~@u{Wnw(j_Ha*5HMBJ7QdH;qS(aG9`(E*gzH_k+E|hxN6&y@EWFZ& z&;or_EQA?i?DyY(f^`)dLhGZ`A?p9aid`Q)15%bkXk>>fIfN@q|9Ig^MQ~m zU~t%ceROF5RFRO1w?{D1gGWtf2Z9~cj)bbYiOr0C5_ zgQ)>ygg)8!Xt()oGjrB4C3pU?ehNX4D`|D376!)`AqM05^%l%K&-Hyjkm` z2SZrcPJ#ejU>$)0MILN1p~4mu_!TE90*FbI+D`P*0!Xc&i9#Q(0~z);nIM`(^n|S{ z5wKMSmVrJR4-=&bgn<222m|T`e#-e#QWhAdGr=O8$qT2mg)9jSKg0Yy zz2Ij6TR@k@!7v;K@Dbc#t|v2r!(qad)eySjjC2^?lOce^VO&vubl!Lhiyq+<69Ixm zjEEA1@;IEtaJoRq4M~^=uc?*6I-Un{z+WOngu)>@B!Gkv7bXQM0)-M|gExeXCsV){ zv!|ROWH1E`HbN4_D# z!=&hnsi1Llc-|BN1ug-?WN1<-U!o!~3?qXlqCP?NL=l;O*+u}%-$2=hQ<R;U<(#_5x)UD7`Qa39Bu{=W*69P6FEd7j|o!*HdXji6gMc43gPxE66z0GB_@Ix zCb%Pw9KfQG9UC@ELqyiBSdc=31KYh&5Zp?{AqdUy2u;umdj`BPui*Frqk8^3afsaH zk-Wui7VQq&F!+gOYyd-u1xzG8J@^$94Q?Xt;a3P4lOWiZ5A1aU(4X+ZeH`dzd>}t4 z9tr?Z1mp!EA1DSuaOr=&WW!)eFvUKXEP^DWx0`|3LMGfB!rKJ61dfnRp zA(IKXX-M@C1oqQk$g@|^5jMhl)fi?C$7eFAbbb^w2yTd|KKaNYaUfU&{P4(2fVWIu z_#_@j$n6t^0|QMGL)ypsgu~QBkT5_)g(4Ji{OD{t4-BNhAPYee8C=PDp(sLzL*SSI zRuU0B{-?{20j;CYK!XUsZ%8mdsqbKfqc8;#l%zgbs#H3Y89`4M1n~VMf`cU8oxCKt z53)ishd0B6$w-n&N=;OFe@QSXIi5yOqzIAjS0R=J(sBM0u7vbA{eqG9I10`PjMG4! 
z-P#aX)b_Lx7*=7VAHu>IkP_Utz>P!PYEX_qMRY;9E2Il7&VMb^e=X8~Ez*B2(tj<| ze=X8~Ez*B2(tj<|e=X8~Ezdw zLUp1Uy15d^raPyzxGcJWLQH2d*?i}87ej=&Gl1b>D?=iJB1myDM7&mrF~OlkFAk4R zw6nCapaKIb(ca#YY-?w4Pd0}Qs@7H{@Xy|Y41CD!ovj^+;>!@^#-k-Vhk5&o^8#-! zhT=k{rKMS>*;sOTNmgVhCnuPOwY3Gnu;6E~1(b9PHh+YKf;XK{P#D_IOA z4}>jPkO+G;sx%oFESiyIG^!PyNoN6LIUl4TOL0LyVXy&;FHO>?C=%X~MI&D}RdDd1 zQ?OW4Y`(xJ6-e(-%EgCQz*b>&(Pn~AD&TX3q&N&FQwU7*6kwhln1~EQABnT*GzJA2 zi)B+WQW^9#3l>hNURPk`3(fe9%tmfV5N* zorCB}6lz95XoQzHBZOvX7&$m!mhyRyMT< za0cb@_@Nv=12(0*81~r`lEw6=Q)o@0Ai)--U=9sXu#pW9@bUs%CW#CtOd4$a5CeR??8#2{_7>Kb zK#D-4oIxr^Vg@W|M8Y-}aIZ>^kof0{Tq{MWNKF7s71$h*^wY>**Fe6$V&b54r-A*1 ze@~t+Mg6Uxry)_&(%|mV8Q!^KGQl>1kirD!O1Q%W`v}q?ViFwIFHI1KN)f5?FFM+X zwpdrZ5XnY{8xm<$=R^*VMFG_aDkF(*HH}M8Ldq)YLY<@O35eFg-K>k@uf}~@++eWz z2rG63PY4(Ul@T25?9HJH;j{tXE`~xOgXU~+P4c!S`Fc3oksN)mUW_DVz|HvSh$%NjCN*2aw#$nq==tCOd+ySVyv>kCVNR4T$lTq!rnG zDa`*MbD56U85EoccKf6nhivE=5+z-$Gt#F6&FSM{j%ZN7-BOt}f2b)bU0C37*9imw zo`yuwDLaSLc?=43ypWXu_bsWdAWM6*RO2tI#lu z@gAXK8Ub*-2^Vs?Ot52*yud2U7Y7iLW5)YDX7H&@4j=F%Ee@Ef z44kKjpGD;OKnWGN7XFJ&2g@=IjQ8+p;9^L&w6+A*EE;K@;VlHv%8`K>hJ6`yCaq6w zaF8TE7@lyz1EUBDOjLWvW<(sw)QrLu_wNikUp$xfq$G(UCc_o!5p5L*KqoDsVH1!r_cSVi4fs(_YwdvqSc(3#6QcmpVw3W zOnQYYr3>-^1B+w{{jW3nUvP~3J@b7T2D2#fq%M*VhO7uWk0t4n|5;!nIS_?#N+MX} zu=u~p_BXkRW`q6=u*BnK3{dCb#2;8uJA;lH=`;D1R66K&MS<`Fr2kZq1SWrEvV$$j zK{OR~vi2nTI(Yip`}mS=ylg?2DoUFM9C5@;Ei$}1jUbc3aXn|Utp&*roUxLi@i~ct zG%$GqgEBnN6&Y=07Xx;dWTN!spClDHOvp}xXS`O1aDOLiQ~%YH{PU!?UrBzqq!FS0 zV@diPVf;-=kajC>Y5i+XI!z6w1Lq@2SJuD%%4mnUt3g}?e+@?b1P2HPz8o^4vJCyo zfTXjBC!3N9HZt1NY2s>;opVS>NEkg_n85t!#S@JA>%v?%TQHLZo{v;y3JHdFZ%Z%B z-f|a*;r&5 z^RGL!e(Rh5btLXN2cj<A~-cDnnQ{KM$TD4a|W; zc)-U9%mfC?R@8oF&7^$@gun}Vu~LEQp`#PYf&`ZIG7~PimOVlwmXGnsf}Fw>9ashn ztW@BP)K?`W$N}us@nGo&&*l2BdpttCB>r^cIRZeTpKS>gq%W5d|E1}LjI|(KbmH2x z?=%4s^8j_~5hBT`Z}y0Q1|HOdk%=Y|$OWu-Wa?M8Y5$HGf+JHkpx8)|%yI!SYJ=$v zpDtP+N(`^C{ktz&3Fok4{qw*V$^o`ocujZ19^=}ndtc&(`$YSEZDLZ8S6ak#j|A%DvKc!KKh{dbSKGpC`FCrTY_sd{* 
zoDuMkgATw9;Uxuti9=`*Y=!GbB|t!D!I7ZLv+(frLVjR?)BKvC6{{vGcktly(M)7bM4kinr z@Bd>v=r2gbve?HeD3-4599d+4?1Lqm8p^JH2edE(<6@YfM`U<-FH_<6;g{DSj4gU!HsBBA{Z!~-+K1y1ZBL2BL`{Du?5ypeS%$3a_a9${C2oi+fGAlulVY6>aU>2nVi!hr`7FkBYfN3Xg zKa#Xk6l)QREx7ujAl6bB!+!SAK4pju114~>#S;v4;7dQSDHTy}*%lzl^v~jwU;#lP zeYnglP7(xMKVVSmbK)xT4f*^0D}lcf_$z_G68I~DzY_TWNCLfHt8_Ln6Qlv3)Sjox zF~B)0J~S-EHz3Fdaj;eP4+72xXmGTX1sud+Z>gAxu|)iBaI{kaoH+%DI4M*QpWd9N%senW z22PNH1@v^lCpc>_vBwfeQAZPj+pQzfpPt5~3j`LSz?GQ7qX9=>7MH@#0H6Is$_%H1 zG=6bcLxM%OKSs*n{RxEYRuunv0#auvom;(Oy(xRcI0fKjGB|;aS=k$wumOTj&VwM` z$Gu@j+rcLj=0Z@(9hvgz!1^Md1;cL03_8^k7O3?4M+7P6?*WiQppi$t&y*1Yyg*1@snbgT6tX;5&(UlnP1{H5fGvWsEXIS)&|Lt|%W=5Go8c5j7Q+gkqzFs9C67 z)DrMv(v7GcsJ*BosMDy6sOzXI)FV_g>NV;UssoKdE2Fj0L(wDAR%l1G2RaZPfu4*` zLQhA}K+i=lN3Ta0p!cIo&==7+(GSo~=(p&v;4_v=7#+-Tj0MIKD3C;oMkDG{_hMS37 zira!agu8&N!ac)%#pCgV@n(1zdxmE?!XTg&^($I7$i zbL8{oi{;PD-;;kS->E=QFjjC@2vMLZWGJjqC{#G3a7W>VLZ_m-qKTr5Vua!}#caj( ziU$?1DAp@}R#H+jRB}?9pu|v`qqJV>u+lZ9W~Cp>8p@{1UdnOGLgnSkMambHA1QxU zQBfJG;;s^_B2ZbbvRCDj$`h6Es+y{nssXAr)hyLbswJxTR6nRGs2Qtys7+Cup_Z?9 zOzn=^JAwjXB*BY7APZ_+-e-K2xj z8L1PX!_--!b5y5Br(@92K^}wXgBA@sIOyJ>cHJSm?z(i{#kz-dALw=rCJy!)oHBUT z;FE(J^ssuSdK2{0^tS3<(R(*UYsi=()FF$993AplAFXeyAErN3f2aN}{jWm}hWZWV z4&5;H($IGXItK0r(+t)aoHuwqOmmp)FvhSo!^(!eA!-vnh)m*o;$>pnaQ)%_!-d0l z48J|R)6m2)(lE#HkYU3J#SxAp5=X2VQ9h#0XqZv3(JZ5VMvsjZj2(@Wjn^4h7`Kl! 
z85uot!N`*%UzzBd1enY)*=JHeN@bMWsOh5$M%^Ec8|^qcW%QQOcTLfzcBaXun@sPR zq0Q{g7-n0{s?6ogoy^(h1?IID$`&3LsTTVznk{uKgDi6_Pg;JkGO~)bT4hyX)lIS^ zF-e7_ImzT5q!$7qjf9{W8$c#=GM zo+mupy_~%=y)JlTy#2fvd*Ala@QL)<=<~$a&^O665Qd(VN1YaxPvio)Uj- zs@&A*sl`)&QbH&Nl&=W^37Zqzs9w}{)VDNu+8Wwxx(hvz{xZ=eF)#5|l1tL+q}R!A z$@$6e7+#ExjE~d&r)`_oo)VH$l+weDW*%b6v!=36vo+W$?8_VjPCBQGYsQ_=ZJO>p zeeLu%ULbEbAH|={KPAutlc`(6QNnq`=2X|zO{w40qSB6~tEaQmZ)KQfEXsH}!*|B6 znb?`sndP&F&&rW7b!tC;_5m|GyUd-{EvnN|IJ0-g^hm@0-^CdSr_sra( zbLY%`HqUR~zWJ*2dGl)*j9IX4A$B2S;q65>i#9CkUQAnjV+m-R^4lxyS8QF0TghHov&wDNo;>xu8F|lE2d^$!LtL|X&6l_q=5Jf4xK6mP zX?@W8QyYvntlaQ(WAeran>;ri+N{5M(dPCov@Q3xx^F$WZRoZo+d8&0wm;nAx1*$B zWI_H;+|JaUErn5q6}udF?cJ@nd&%yuB6d;Jp0GWa_d4v|TdZHaVjp^+aNq0war^Hc z@IG+rpvA!*hjb1tISd^Z9)5FV>XDkGfk(@aIUYNB-1zwB6PhO$m!L{AO4?2)pKLlc z@l@66ai`1Aj5%|n)Uveb?C`Uj&S{@pabEHK-19wUGs?bR;9hulG3ny7^7!()OEH)3 zU!HLJ_LaaZ*RJ|ry;R{*QFhJc+S%*Q*H7PYym9iT{mqhFcDGJc+Et#oZGXGuj>DZ( zcb)E@sTx~#{+`>ti`CxMSMLYhzxiPNgQ}XynuoPJt2oO-$X z#SMgp{3rTP3K~Z@9%{00I@j#oeCuiW)4FHWXK$bLpZ|O@w?(BT|K+fkdtX_DaJ<#M^;TO<+l!C9k3FB3ejfa}=!@-_@~@#^pL}C| z>ug{Aeem}^KOBElbVPT&>`eQK|GBQP5geHZWRD3NhLjzc>{V6P37Ui> z7n?n&hY>&W3hs@dVl`y5hv`n$1i2wuVbP+j0F~i@27yAHo=sY{BuUc6gA87gV*~r`$)tXh0@9LdSS=wr9@N}us3w>So zhb+(>6S8c#GEf5=eHwTaz$JFVLJcJ;{Uvp(z2*~uqUEndg7zu3{|7cye+ zmMz9@HlLQ}Wbo29*I57MrtAKmLi?%XZFg+8eLH^9vOM{l9LsW#!IOu170hN=X_e1n z-Ai8jsE}EF?endSnk8fE(DgdwgQ`+aJbn?f{u`ABc$WuqN6Dcrt90d3T<_g*a9+P~ z|1dL6BOM}PpLq&7$!MHX;;Q~0h2+Wf3lY32)V^#|tyl~U#Uv`v<=H#feY zm}DN#8LHP>UR8W0D+X%f7S5RQUMY0iu<%3ilP$~_ml)7pl490$^+5ILF-tG~RO#r{ zir%$4_Pd^8oPOAeM;%8y>Z4a2t`eB6@E<(1GWI0;?9WYs&N%JTz~x`6e9Y)(?uS!n zl)KSBuZg+w-t@qOdwIK37O!49Ufuka*{}=6oVQ1Fl)KlhDEGh}PI24u;(N*a*`oy8 z5Q(5+iF}*>()H1XZy^_M87SYv*CdWSKmFC^%N0w5X83(s<*29=$>{>e35 z);LNEKR6@!yV<@OpNdoW4tq=vP9L2;bnijKcWZ*)PClx0NG&%zZnER7)o#=3=9{*W z_+O&S_Bz?!G#bur3AZjT;!#F=nka|j4m%#O#f+J<<>8d?U2Eg#5!OxkysP43ppVMI z;CxrJ9w^9Iqbr%QeB0dxE^mc8J51z)G(F{RzJAn$$h(vzO6^^C0w4mak7KE?%iI 
z+@j+h@$a@2c}@1+a&1W2?yrNr_PNp9y&qLCHXkxJY7fil=@KXFvp?CXe(c?Pznl z*#lOlcSNUEM!F!Oq^ol){jlst|qMAI%v0@{N8B|!#FKgF-|s%JA`;bTiE*Z zx0ddp+&%X-VWOsv<*4_Lr_&5dHiIuMH(U;$eCEa}gB7<5-*iQ5y=dp%D5YocBHz|* zH@>ZL)1Gi?W=BriWOupeGnHE`+TU8F*Am_x%x(N!JAFhPFUs`$&kA)P;c~0=kjKN` zL2@7V(YuZm%-o3ATHl>$M9*6rpH_Rn{M(h&ub)nQq%&&X;{E%-ym?{GeHNfJZ~fyU zm5r01L@#{6-G6pjaox11+p}*xaC~A{lotBop{nUaW^_b-xueEfg;gIka`MYf^AA^Y zU0fXIzaI9{T8;hyk5jafPq+UN+ljrDUiX;TzGuU*d0Kah^W{$`;6A2~yQWupnLF8F z+x8`Il`Y)$v>!~q+tj(>s9)*)&ykj;arKll<{U2NasHe&lQ)*n)?GDWw}Wc(n5U*L zi&YCcURE|fUARo?!qa&-;}-1oP7XWC3VSi(qPF(Zhbc$)kssWK+RAFl!xfXA4MtCw zb00m4xvW9)EXO>>ytFB*W{GxobERg)KEJ&y_tT4B(mOZIu^#u7)zVeTXm~h}Ubl3% zeA`L~@+@wJrJJ?r2+AU6Q*OkX~Dt0E^S}?pj<74)bMXU1$RUEW#44IjKFlB4I^Ff-)Pq%MbmWfV3 z{kkkOvj^9;ZVd_P45~3OB5Ll_*%6`!THBd&T^-?39aPFK(`TJ~n&LcdM77U7oIPqTUuJ^f`1<%h_$F z-|z1+(EeC<_mH+}^sA%IrqSiQ#;tvkTbUlc_%rXt#gAcU&93|`UQk~4{SHVA{#OI> zP)?v*PbPdE?^|u+W}!h??s}JolN)}sTke1^@8`1c9X~$FUE8>taq>tH^s^-Ev$R7X zTx-dmvt~(()10w!Io&j;C5yJ7lGjxqtcclUj@ws`e&MixyWIG*dvj(wDA5n1I`wfV zJ~aQK?!#!Oax~U_d;X<$`Pi%Ow7qsoT&388u`4ltmz&q)e4rA7`?Hc$1t^Xm+C1S# zgz-Z39Iu>*ru5j1*Kfe=>ev(<8-)=k(toML+_xm z5Z6<^Mf=-Ijf*Ersi+Mna`q3M-<-T2DtQ%sc+S+B>Dx)TBy-3EdsEw8fua4VxM>0^ z5tSQhGGcXmLA~cAlE#F?HpDUSbC0)KPRdZsL4E0gmMzqodNnr!XLM|p{G0vRzW$>R zpI;riAaq=*E-uFRThJ&qm(ZF>lr8H1;7a@=#Um)joJm_xl@tb#DpR?RdZ?KK;hn7) z(hH9sGx&ra>QQI9S^3@3C-gj}&qc?Bpu|%JP0D*n#r5v;)5FS3i>D-7sIVoYI7P}c5L5DapTT8c%1t1wrb=sY2zTQZ9y9mjeYf7w#jE> zLloEI=<_vpr5CNdJwdBv(x}C8+M^dWr|2Fzesp5f6ive74YyvbB;+lbM37-4v?K(hZq6ZgKXy<}NR3>mlr&Fm}-8xR?S1-?-6( zD!;p;&zi=a&LS)OYThpQ#JXx@M{7~$&GFo2Y)qde_Z=H^WJ%x!`tHP83S)zRJe*2V T-a933_K>p0hwf9K^*s7NDw=0I literal 0 HcmV?d00001 diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg_metadata.json b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg_metadata.json new file mode 100644 index 00000000..a2adeab1 --- /dev/null +++ 
b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/resources/testJPEG_IPTC_EXT.jpg_metadata.json @@ -0,0 +1,166 @@ +{ + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCity": "Atlanta", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrExtadr": "1234 Some Road", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrPcode": "30339", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrRegion": "GA", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiEmailWork": "info@alfresco.com.other@example.com", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiTelWork": "555-1234.555-4321", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiUrlWork": "http://alfresco.com.http://example.com", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CountryCode": "US", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}IntellectualGenre": "intellectual genre", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Location": "Rock Creek Park", + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Scene": [ + "iptc scene 1", + "iptc scene 2" + ], + "{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}SubjectCode": [ + "iptc subject code 1", + "iptc subject code 2" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCopyrightNotice": "Ray Gauss II", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCreator": [ + "Mother Nature", + "Man", + "Mother Nature" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated": [ + "1890-01-01", + "1901-02-01" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSource": "National Park Service", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSourceInvNo": [ + "123456", + "654321" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle": [ + "Rock Creek Stream Bank", + "Pollution", + "Some Tree" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AddlModelInfo": "rocky 1 and rocky 2 are big", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourceType": "http://cv.iptc.org/newscodes/digitalsourcetype/digitalCapture", + 
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}Event": "Photo Bike Tour", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCity": "Washington", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryCode": "US", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryName": "United States", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedProvinceState": "D.C.", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedSublocation": "Rock Creek Park", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedWorldRegion": "North America", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity": "Washington", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryCode": "US", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName": "United States", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownProvinceState": "D.C.", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation": [ + "Rock Creek Park Sub", + "Stream Section" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownWorldRegion": "North America", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailHeight": "3456", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailWidth": "5184", + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}ModelAge": [ + "1000", + "1001" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageCode": [ + "ASPP", + "OTHER_ORG" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageName": [ + "ASPP", + "Other Org" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}PersonInImage": [ + "rocky 1", + "rocky 2" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegItemId": [ + "100-ABC-ABC-555", + "11223344", + "55667788" + ], + "{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegOrgId": [ + "PLUS", + "ORG 2" + ], + "{http://ns.adobe.com/photoshop/1.0/}AuthorsPosition": "DAM Architect", + 
"{http://ns.adobe.com/photoshop/1.0/}CaptionWriter": "Ray Gauss II", + "{http://ns.adobe.com/photoshop/1.0/}Category": "PrimaryCategory", + "{http://ns.adobe.com/photoshop/1.0/}City": "Washington", + "{http://ns.adobe.com/photoshop/1.0/}Country": "United States", + "{http://ns.adobe.com/photoshop/1.0/}Credit": "provider", + "{http://ns.adobe.com/photoshop/1.0/}DateCreated": "2011-08-31", + "{http://ns.adobe.com/photoshop/1.0/}Headline": "Rock Creek Park", + "{http://ns.adobe.com/photoshop/1.0/}Instructions": "instructions", + "{http://ns.adobe.com/photoshop/1.0/}Source": "source", + "{http://ns.adobe.com/photoshop/1.0/}State": "DC", + "{http://ns.adobe.com/photoshop/1.0/}SupplementalCategories": [ + "category1", + "category2" + ], + "{http://ns.adobe.com/photoshop/1.0/}TransmissionReference": "job identifier", + "{http://ns.adobe.com/xap/1.0/rights/}UsageTerms": "rights usage terms", + "{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerID": "RGAUSS", + "{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerName": [ + "Ray Gauss II", + "GG" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorID": "RGAUSS", + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName": [ + "Ray Gauss II", + "GG" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierID": "RGAUSS", + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierImageID": "supplier image ID", + "{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierName": "Ray Gauss II", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorEmail": "r@example.com", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID": "RGAUSS", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName": [ + "Ray Gauss II", + "GG" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone1": "555-5555", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone2": "555-4444", + "{http://ns.useplus.org/ldf/xmp/1.0/}LicensorURL": "http://rgauss.com", + "{http://ns.useplus.org/ldf/xmp/1.0/}MinorModelAgeDisclosure": "Age Unknown", + 
"{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseID": [ + "model release id 1", + "model release id 2" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseStatus": "Not Applicable", + "{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseID": [ + "prop release id 1", + "prop release id 2" + ], + "{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseStatus": "Not Applicable", + "{http://ns.useplus.org/ldf/xmp/1.0/}Version": "1.2.0", + "{http://purl.org/dc/elements/1.1/}creator": "Ray Gauss II", + "{http://purl.org/dc/elements/1.1/}description": "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.", + "{http://purl.org/dc/elements/1.1/}rights": "© Ray Gauss II", + "{http://purl.org/dc/elements/1.1/}subject": [ + "bank", + "park", + "rock creek", + "stream", + "washington" + ], + "{http://purl.org/dc/elements/1.1/}title": "Downstream", + "{http://www.alfresco.org/model/content/1.0}author": "Ray Gauss II", + "{http://www.alfresco.org/model/content/1.0}created": "2011-08-13T14:40:51", + "{http://www.alfresco.org/model/content/1.0}description": "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.", + "{http://www.alfresco.org/model/content/1.0}title": "Downstream", + "{http://www.alfresco.org/model/exif/1.0}dateTimeOriginal": "2011-08-13T14:40:51", + "{http://www.alfresco.org/model/exif/1.0}exposureTime": "0.0125", + "{http://www.alfresco.org/model/exif/1.0}fNumber": "10.0", + "{http://www.alfresco.org/model/exif/1.0}flash": "false", + "{http://www.alfresco.org/model/exif/1.0}focalLength": "50.0", + "{http://www.alfresco.org/model/exif/1.0}isoSpeedRatings": "640", + "{http://www.alfresco.org/model/exif/1.0}manufacturer": "Canon", + "{http://www.alfresco.org/model/exif/1.0}model": "Canon EOS 60D", + "{http://www.alfresco.org/model/exif/1.0}orientation": "1", + "{http://www.alfresco.org/model/exif/1.0}pixelXDimension": "103", + 
"{http://www.alfresco.org/model/exif/1.0}pixelYDimension": "69", + "{http://www.alfresco.org/model/exif/1.0}resolutionUnit": "Inch", + "{http://www.alfresco.org/model/exif/1.0}software": "Adobe Photoshop CS6 (Macintosh)", + "{http://www.alfresco.org/model/exif/1.0}xResolution": "72.0", + "{http://www.alfresco.org/model/exif/1.0}yResolution": "72.0" +} \ No newline at end of file diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractor.java index 7d716978..91597a23 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractor.java @@ -26,13 +26,15 @@ */ package org.alfresco.transformer.metadataExtractors; -import java.io.IOException; import java.io.Serializable; +import java.util.Arrays; import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import org.alfresco.transform.exceptions.TransformException; import org.alfresco.transformer.tika.parsers.ExifToolParser; -import org.apache.tika.exception.TikaException; +import org.apache.commons.lang3.StringUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; import org.slf4j.Logger; @@ -42,6 +44,12 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor { private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class); + + private static Set IPTC_DATE_KEYS = Set.of("XMP-photoshop:DateCreated", "XMP-iptcExt:ArtworkDateCreated"); + + private static final Pattern YEAR_IPTC = Pattern.compile("(\\d{4}[:|-]\\d{2}[:|-]\\d{2})"); + + private ExifToolParser parser; 
public IPTCMetadataExtractor() { @@ -49,13 +57,12 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor } @Override - protected Parser getParser() { - try { - return new ExifToolParser(); - } catch (IOException | TikaException e) { - logger.error(e.getMessage(), e); - throw new TransformException(500, "Error creating IPTC parser: " + e.getMessage()); - } + protected Parser getParser() + { + if (this.parser == null) { + this.parser = new ExifToolParser(); + } + return this.parser; } /** @@ -65,9 +72,87 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor */ @Override protected Map extractSpecific(Metadata metadata, Map properties, - Map headers) { - + Map headers) + { properties = new TikaAutoMetadataExtractor().extractSpecific(metadata, properties, headers); + ExifToolParser etParser = (ExifToolParser)this.getParser(); + if (etParser.getSeparator()!=null) + { + for (String key : properties.keySet()) + { + if (properties.get(key) instanceof String) + { + String value = (String) properties.get(key); + String separator = etParser.getSeparator(); + if (value.contains(separator)) + { + if (value.contains(String.format("\"%s\"",separator))) + { + separator = String.format("\"%s\"",separator); + } + String [] values = StringUtils.splitByWholeSeparator(value, separator); + // Change dateTime format. MM converted ':' to '-' + if (IPTC_DATE_KEYS.contains(key)){ + values = iptcToIso8601DateStrings(values); + } + putRawValue(key, (Serializable) Arrays.asList(values), properties); + } + else if (IPTC_DATE_KEYS.contains(key)) { + // Handle property with a single date string + putRawValue(key, (Serializable) iptcToIso8601DateString(value), properties); + } + } + } + } return properties; } + + /** + * Converts a date or date time strings into Iso8601 format

+ * + * @param dateStrings + * @return dateStrings in Iso8601 format + * @see #iptcToIso8601DateString + */ + protected String[] iptcToIso8601DateStrings(String[] dateStrings) + { + for (int i = 0; i < dateStrings.length; i++) + { + dateStrings[i] = iptcToIso8601DateString(dateStrings[i]); + } + return dateStrings; + } + + /** + * Converts a date or date time string into Iso8601 format

+ * Converts any ':' in the year portion of a date string characters to '-'.

+ * Expects the year in the format YYYY:MM:DD or YYYY-MM-DD

+ * Will add the correct delimiter, 'T', to any dateTime strings, where | can be any char other than ,'T': + * YYYY:MM:DD|HH:mm:ss.... or YYYY-MM-DD|HH:mm:ss.... + *

+ * Examples:

    + *
  • "1919:10:16" will convert to "1919-10-16"
  • + *
  • "1901:02:01 00:00:00.000Z" will convert to "1901-02-01T00:00:00.000Z"
  • + *
  • "2001:02:01 16:15+00:00" will convert to "2001-02-01T16:15+00:00"
  • + *
  • "2021-06-11 05:36-01:00" will convert to "2021-06-11T05:36-01:00"
  • + *
+ * @param dateStr + * @return dateStr in Iso8601 format + */ + protected String iptcToIso8601DateString(String dateStr) + { + char timeSeparator = 'T'; + Matcher yearMatcher = YEAR_IPTC.matcher(dateStr); + if (yearMatcher.find()) + { + String year = yearMatcher.group(1); + dateStr = yearMatcher.replaceFirst(year.replaceAll(":", "-")); + if (dateStr.length()>year.length() && dateStr.charAt(year.length())!=timeSeparator) + { + dateStr = dateStr.replace(dateStr.charAt(year.length()), timeSeparator); + } + } + return dateStr; + } + } diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java index 1ffa420e..e43677a4 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java @@ -39,6 +39,7 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.Reader; import java.net.URL; +import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -58,27 +59,84 @@ import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.image.TiffParser; import org.apache.tika.parser.jpeg.JpegParser; import org.apache.tika.sax.XHTMLContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class ExifToolParser extends ExternalParser { + private static final Logger logger = LoggerFactory.getLogger(ExifToolParser.class); + private static final String EXIFTOOL_PARSER_CONFIG = "parsers/external/config/exiftool-parser.xml"; - public ExifToolParser() throws IOException, TikaException { + protected static final String 
DEFAULT_SEPARATOR = ", "; + protected static final String SEPARATOR_SETTING = "-sep"; + + private String separator; + + public ExifToolParser() { super(); - ExternalParser eParser = ExternalParsersFactory.create(getExternalParserConfigURL()).get(0); - this.setCommand(eParser.getCommand()); - this.setIgnoredLineConsumer(eParser.getIgnoredLineConsumer()); - this.setMetadataExtractionPatterns(eParser.getMetadataExtractionPatterns()); - this.setSupportedTypes(eParser.getSupportedTypes()); + try { + List eParsers = ExternalParsersFactory.create(getExternalParserConfigURL()); + // if ExifTool is not installed then no parsers are returned + if (eParsers.size() > 0) { + ExternalParser eParser = eParsers.get(0); + this.setCommand(eParser.getCommand()); + this.setIgnoredLineConsumer(eParser.getIgnoredLineConsumer()); + this.setMetadataExtractionPatterns(eParser.getMetadataExtractionPatterns()); + this.setSupportedTypes(eParser.getSupportedTypes()); + } else { + logger.error( + "Error creating ExifToolParser from config, ExifToolExtractions not enabled. Please check ExifTool is installed correctly."); + } + } catch (IOException | TikaException e) { + logger.error("Error creating ExifToolParser from config, ExifToolExtractions not enabled: ", e); + } } - + private URL getExternalParserConfigURL(){ ClassLoader classLoader = ExifToolParser.class.getClassLoader(); return classLoader.getResource(EXIFTOOL_PARSER_CONFIG); } + public void setSeparator(String sep) { + this.separator = sep; + } + + public String getSeparator() { + return this.separator; + } + + @Override + public void setCommand(String... 
command){ + super.setCommand(command); + if (command.length==1) { + setSeparator(findSeparator(command[0])); + } + else { + setSeparator(DEFAULT_SEPARATOR); + } + } + + protected String findSeparator(String command) { + if (command.contains(SEPARATOR_SETTING)) { + int start = command.indexOf(SEPARATOR_SETTING)+SEPARATOR_SETTING.length()+1; + String separator = DEFAULT_SEPARATOR; + if (command.charAt(start)=='\"') { + //get all chars up to the next \" + int end = command.indexOf("\"", start+1); + separator = command.substring(start+1, end); + } + else { + int end = command.indexOf(" ", start); + separator = command.substring(start, end); + } + return separator; + } + return DEFAULT_SEPARATOR; + } + /** * Adapted from {@link org.apache.tika.parser.external.ExternalParser} * due to errors attempting to {@link #extractMetadata} from the errorStream in original implementation.

@@ -95,7 +153,9 @@ public class ExifToolParser extends ExternalParser { TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp); - parse(tis, xhtml, metadata, tmp); + if (this.getSupportedTypes().contains(mediaType)) { + parse(tis, xhtml, metadata, tmp); + } switch (mediaType.getType()+"/"+mediaType.getSubtype()) { case MIMETYPE_IMAGE_JPEG: parseAdditional(new JpegParser(), tis, handler, metadata, context, mediaType); diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml index 7d77927c..076dfe54 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml @@ -5,7 +5,7 @@ exiftool -ver 126,127 - env FOO=${OUTPUT} exiftool -args -G1 ${INPUT} + env FOO=${OUTPUT} exiftool -args -G1 -sep "|||" ${INPUT} image/x-raw-hasselblad image/x-raw-sony diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractorTest.java b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractorTest.java new file mode 100644 index 00000000..fc35dbcf --- /dev/null +++ b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/metadataExtractors/IPTCMetadataExtractorTest.java @@ -0,0 +1,48 @@ +/* + * #%L + * Alfresco Transform Core + * %% + * Copyright (C) 2005 - 2021 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * - + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. 
Otherwise, the software is
+ * provided under the following open source license terms:
+ * -
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * -
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ * -
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ * #L%
+ */
+package org.alfresco.transformer.metadataExtractors;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+
+import org.junit.jupiter.api.Test;
+
+/** Unit test for {@code iptcToIso8601DateStrings} on {@link IPTCMetadataExtractor}. */
+public class IPTCMetadataExtractorTest {
+
+    IPTCMetadataExtractor extractor = new IPTCMetadataExtractor();
+
+    /** Converts each input date string and compares it with the ISO 8601 string at the same index. */
+    @Test
+    public void testIptcToIso8601DateStrings() {
+        final String[] iso8601Dates = { "1890-01-01", "1901-02-01T00:00:00.000Z", "1901-02-01T00:00:00.000Z",
+                "1901-02-01T00:00:00.000Z", "1901-02-01T00:00+00:00", "1901-02-01T00:00+00:00" };
+        final String[] iptcDates = { "1890:01:01", "1901:02:01 00:00:00.000Z", "1901-02-01 00:00:00.000Z",
+                "1901-02-01T00:00:00.000Z", "1901:02:01T00:00+00:00", "1901:02:01 00:00+00:00" };
+
+        assertArrayEquals(iso8601Dates, extractor.iptcToIso8601DateStrings(iptcDates));
+    }
+}
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/tika/parsers/ExifToolParserTest.java b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/tika/parsers/ExifToolParserTest.java
new file mode 100644
index 00000000..4303f255
--- /dev/null
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/tika/parsers/ExifToolParserTest.java
@@ -0,0 +1,59 @@
+/*
+ * #%L
+ * Alfresco Transform Core
+ * %%
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software.
+ * -
+ * If the software was purchased under a paid Alfresco license, the terms of
+ * the paid license agreement will prevail. Otherwise, the software is
+ * provided under the following open source license terms:
+ * -
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * -
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ * -
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ * #L%
+ */
+package org.alfresco.transformer.tika.parsers;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+/** Unit test for the separator lookup done by {@link ExifToolParser#findSeparator(String)}. */
+public class ExifToolParserTest {
+
+    ExifToolParser exifToolParser = new ExifToolParser();
+
+    /** Covers a quoted separator, an unquoted separator, and a quoted one with spaces and symbols. */
+    @Test
+    public void testFindSeparator() {
+        // quoted separator in the middle of a full exiftool command line
+        final String quoted = "|||";
+        final String quotedCommand = "env FOO=${OUTPUT} exiftool -args -G1 " + ExifToolParser.SEPARATOR_SETTING
+                + " \"|||\" ${INPUT}";
+        assertEquals(quoted, exifToolParser.findSeparator(quotedCommand));
+
+        // unquoted separator: everything after the following space must be ignored
+        final String unquoted = "TESTWITHOUTQUOTES";
+        final String unquotedCommand = "nothing matters until the " + ExifToolParser.SEPARATOR_SETTING + " " + unquoted
+                + " now all this extra should be ignored";
+        assertEquals(unquoted, exifToolParser.findSeparator(unquotedCommand));
+
+        // quoted separator with spaces and symbols, setting at the very start of the command
+        final String exotic = "Test something bonkers 112!£$%^£$^";
+        final String exoticCommand = ExifToolParser.SEPARATOR_SETTING + " \"" + exotic + "\"";
+        assertEquals(exotic, exifToolParser.findSeparator(exoticCommand));
+    }
+
+}