From c95cbaccd94077825b79a75a02346efe6627e4bb Mon Sep 17 00:00:00 2001 From: Andreea Dragoi Date: Mon, 24 Oct 2016 12:13:50 +0000 Subject: [PATCH] MNT-16709 : Metadata extraction on 200MB PDF file causes large heap utilization - added concurrent extraction limit - added max document size limit git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/DEV/5.2.N/root@131709 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- config/alfresco/content-services-context.xml | 11 ++ config/alfresco/repository.properties | 4 + config/quick/quick-size-limit.pdf | Bin 0 -> 35213 bytes .../AbstractMappingMetadataExtracter.java | 95 ++++++++++++++---- .../metadata/MetadataExtracterLimits.java | 48 ++++++++- .../metadata/MetadataExtracterLimitsTest.java | 59 +++++------ .../metadata/PdfBoxMetadataExtracterTest.java | 67 +++++++++++- .../metadata/PoiMetadataExtracterTest.java | 92 ++++++++++------- 8 files changed, 282 insertions(+), 94 deletions(-) create mode 100644 config/quick/quick-size-limit.pdf diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 1a83ad10f9..76a838bb52 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -330,6 +330,17 @@ + + + + + + + + + + + diff --git a/config/alfresco/repository.properties b/config/alfresco/repository.properties index 55e2fae9a6..6b61707db2 100644 --- a/config/alfresco/repository.properties +++ b/config/alfresco/repository.properties @@ -640,6 +640,10 @@ content.metadataExtracter.default.timeoutMs=20000 # Indicates if the metadata extracter should parse shape objects inside open office files content.metadataExtracter.parseShapes=false +# +content.metadataExtracter.pdf.maxDocumentSizeMB=10 +content.metadataExtracter.pdf.maxConcurrentExtractionsCount=5 + # Property to enable upgrade from 2.1-A V2.1-A.fixes.to.schema=0 #V2.1-A.fixes.to.schema=82 diff --git a/config/quick/quick-size-limit.pdf 
b/config/quick/quick-size-limit.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3bdbd256f30a9015bcbf777b1447e3dc98f459e2 GIT binary patch literal 35213 zcmaf)V{j%w*QR6JoY;9{n@?=pwr677wryu(+cqY)Cg#q&Uv1U5TU)#R>!5M&e_h?@ zqEHkA0+<zjZ0RcD{S7$RLJGh_O>@l5i140;L?;)RI6@Y!&#yWqP@yv&TK*2q5dWglB zcYTz06#{cE-Ti}Y{N^Z=OQ_^Ilg>YlgBOA*n}eP0zpzy_;$5yuH!$DTQ0A|>6fe1% zm{^#b9NBWYn(l{&f0|+9l2bn7VN_Q|9#5^5=G9#0L;~eTjw^r6=k5TW-fa7^Q5Q)c z;mqt!|EnnC|Cs+%3(J2O^G_rFtv;J=?uY7)atBS8!E_b&*Uq4)3 zZ+exzyL&v%^azQB6^mjGjgdk~#Y?t^LZG8H_U!=)1fd}@F`UG*6C4OEfWok#Z2m~m zL7U}4touAO&ZQ?lE^xPH+$V3v7=0>C4sn__Q($Ve>LMm&ld?qFi8(( zcsq5nJOqwjJjUvXUu%zV9)n#g=nrqgx_-B|(^-36J=>4P3H@#7-}d=*qc$TbPzzsr zegJxfdhl5X@;#LCgP7s;92P+OAep~AIpo+P;^`;TGJ7qf;@hBn^(E|Mf zcw&ghqfRK0VM!XqNzWrSgLVg`j(j1N_&zYbVVD)7E&afkplpVXIl!MEOczFIrUlj& zQE$v7H?To?<+tqb`bk5hsl$i&XJEuyiWt;7-PTg6Hk& zN4DhW?I|D2UBkw&DG}_=y#N|&8nzptmB4srDPuo*Qn4=r)n$~>@q{@05TNGc=Bs`O zWK0vpZ6moce>$A{xGAv28tvqPcpE?%0!VsZ!s zakH>(BHW6sn?V2~L5f-7NluVW*?KbqGd=82yCbtBU2`6U+HpN$wGx|i?#gMAw4&_C$WcT}<|6{3iI$g=Nm5KYlPdIDl+uZ;)#^F?J{* zmWJF~FNa&(b9mLo*g_!+lb8M);ZCEw^|C_`c45(zkMQV7$9sVovPhGRado=A%C53rsW&GR?T0 z61>5`5sWAr6HPs`=)zxg6TJxr^+Gr)vvM-Or{nNuZO`-!aksyZVn=(#>za0pma|kOCjNBECWi*P>kR_`ES|N9BGKdK~=dLNbVR z0I3v+ME+y)BKHCP!5M0j9F2L3VXU2T!^)d3Iwf^1cT5nqMq$g}8QYz^-T%Ybz~p=C zPM0!ma{46mcg&U>T(Mya$0MD=M(-t_4X&5r^Eq$VFOgB;fTOQn{@iKcls;KmcK@S z8(|&j8}felPaMG*g*}5mX}6Ov&oBHRG(kpB>$ix544MEpKrGvr z7@S!^3{_=H5=}8sM(K@|)O#)=zauhFW)LySom#wDxdyZ&klXRU(>8@^jb9&M|CzYe zu|$uKzybdPQ_F|bh3fJmtN}+dYj|(pP5gmr{UjrfZ4z#GETGJEc_WNG1Eq3@-$f9& zFCL9pj%j`^B0To@C&t3BdhlJ)alyc}0^@6XFK8F;ztGedV!XF?hB2dxR4xeL5HHFI z?+w_D1IUfPPAR*P%9R3Fn4Swk76&1Oa?=XGV|(Lj>RlK?91_#j!c7om>3F8uKvoR~ z%_;GBA)GftTK56e=y8A0Pce_}*}x^%4SdfCWK3?;ie5mxaC!V09T}Y%nF$^q=<>}O zEaPA##|HDsQU_)pnrMjGhb8+>13x$ep}N2_(r51wWvR~lo}2voLQUc^PMpj7yTRWWRKsFj45fn_ zfx!kv-mbRTZ7H28_8O3WZm8yB1JSaE#`^T)C0ZHAZi)n+ zEwt3Jdq)~8;NBK$^}+;rCAZ#i4rTSAhP_01;mfg4v`#Rb5zC_N7n^OWoNB@z(5@R3 
zhGwc+DG^APZXo9%c#Y6rNQPdxr64ACxo-Z#*5I<%W0=*IKb)be#t=f+4$qHU3&b^6 zG&I^jze200NOafnY;i^UxkX1%#D!B_(@D&Z;vhH{wT?)NbX|U z+UP3@4smeCGs}|Tv{{e&ZgY>CV)1ZltT^OqUzoS4^74o!Za11WaW$Vzyj8ULqH(eb z5zVDa)mm*RgNyEhidE~84p>grAU=w|sN*P;(~(pqu3#goCnc7$@5fVeQLXiu$)VSe zw3xvFc20@h+esyiEfT72ni8h;)LTZp;J*Nso`^<|L{{E%dt z@ZVv=43}S03NJ)2cfW%$W6n@=`~fo*zeYPCdmdC@YK$fxHJ`d-EEJ>w9;z;C*Ow4W zB4@UU<%>l_brps&?yb0W&kY6(6YI;;U982RMEN(53=EAs1BJajZcwyfP7w%&imFC& zi*c+8{41`NaUqc&YlzG!0(K}G<=;Au*yQa0;VWo#yOIE79WXVwm-48rF9x_a!Wf5T zM_`sKJ{Id5=}2Be7_p(udsNATd{HL9*O*#kpyIAv^vsWoJYlVrPn!(~GKq-mGz$Y|LmDFFS( z(|M7qnZ*_ev3V=;TV&YC(t+xL6SS1zJoH zzVXIrnIcj%1>-|i#;zSy?Y=z596|LC|L;Oq0YUZ137Ju8sjKIYv05ZImfp_5TMvj(t*`|_Prxeq1~duqNHrJX}W3GWu)logW`4FULyRv z&}sWsIUUCjsT4>V`l>YK=i8A?G_1_Ro|YCR2l;B)nJnx_TvF}`6b7vZbB@n|NWZ%A z)ML0=6O{V6eqp@CY6|k%vTs;xZ)CJ1KVj0jyfhg7K6PpL{yF$^#fCW0kv7oL*@sLs z(w$JPR|`)%XhdhPf<)bR;FZDuQZ|JmFlI@l^d;*i>7@DCb#Awen>JCxu`ZELMcSow z3maWOKmy|~(U+Vn3g?zV=gi9L;tYP)m3oLC4~nmDi2zkY*;fWhEqxE?=j_}^BuO_z z^=hajudk%!tDlb(b@oFrEjQJim6A1Tgo|r+B)yGIBxO8b_?H9~I z13L7!RLDbF!-qGSG$l!x9o{gRz3u#*BIcXYfADm$ynNU={M4#VDGegyr9W3g0yN1+7l8j ztr+C1WD3}!dgbqGMn9wXy?Gm79V4kdLhtJn&R%|BD13ttzzTy)PlL&*753NsAY(En zP7ZPQ>ty|VNm;$~NAGUeV}y?fsYv%-*6&e&PvPXCI<7#G2FB1BiZikXM$+MtM5?*j zZ3XhCsCj7(eC-SEgD}?LHa<^)Q~6coMvoG{^w+^l36&?aFZ#`gMdF>D9*wQ685V1+ zZ|Ztdi6X^vcF>W_{P)qf!bNL0E^zat*hB0hi0t?xImtOIaLPmJY3JbSj{5cF_@ih4 zCVcbc3g2e3eCYxC7!`sgQOqK~|I$Pw3*zF}b-yoFbl&)0vBytU8A-6!1E?`| z$UFfaYa_+$^)hx#iX5jdOP<|8LMBg&*5BA`+xB7ndvcFS-dNHDQtZaci3SM8##u@z zpGL`*ExZE)b*tqo!M}=>Z<~15u*dAl;N+8Lvcw)~Fx62XQ{5Xp=_rl^=6s{iIma7) zY$%7=wZx*x#Nr|rlhfA+XfVgIW)4NoOZG&Qq^C$(uW(HvQ`e1e$XqUcei^(*`GZ#& z1_e6Bq99FWPfZ}0_|@MGUy#X^OMBM(^jCR#pr0sv<7enrtEz-<@QNJhK<&5rzgBLLh!jaJde`F9nF@mj^ z9LQ`xv|^Yre3(ZW^7Ue!am8snkcM7>?-C#KXHpPaqQ8u(`|r$4%7SP47e81-&`&}y z;SOtI6Vv=zr1uscllBzLC|MBw`ku58`E#vZt?$usKRGoDEfqVp{g1=f3Yp=N!=nnD zXq#-p@iyMa)`T@dQXi~>e&>|OjYG8xgWnj^X-RDz3$Ly&jW+PTS-rCji;v-tsC|f~ z>2koby^(M=DlUHaBG3j(!j9=f1&uQ#XX#mm%ddO}l4|?4gTYN7D(~po6q*+mQlQ*x 
zE{3h_U5PEU!U8VOuT_jse^Dnyr(Bp5YNeSIS#vK~7)nZ-v9btLA@FJ&dWMnvT!!Mv zHR%UOY3qr@>m0>7r_U(?iPMlPtfU;J>)z+JCI^9)`+H+Lhein^-@Btbc+KbgI?bT4 zxz99#v~7Nm+d4t-mj|;bh)x=|6YZ&VI!J_HVTW0 z(q<9v5>pp&=@6`5g0)!5I-5224^e^eVV_-NeS8kk72Ch)M1lk~fu>|6_|1lI4C&4% zV$JrrQPi=VCy>5j5xTqNC>2$bpS5HugR$g^J`)Mv>L`C=Z_%HV&YrKGn`9GJd21}s zmH06j?86JGe6V6&#`S3RAaF=OuvQu*2*(2n(9i^}Clvz56$TwsDgy6>MnaJU1{p;q zw~LAaVMA4(6gTn}0B2$#o6fT)?Dnb`IlDM1c>ASu!ftdE79u7r35 z8&@hi&o{Qb$Sv_{cpgPf9;(qOb5!D1bb3kN7brb=AUJd}+9vB6V(>*ZL%tc;ci@Nb z5KhOu1)a$a|FFukrt~8@GzkfZURC~;Am9X$pTN&STQ);bwP*z%bE(##NmY7^??(3f z(J|71?CsCe65hBWzvG;Wm1~4=(S3BHQU^<*nOI+B^T?Hc5>ly? z@SO}nX}&LID!&=f`vblcE(vh*Uyv`2Z zT%`~2J&oS%xXgBv`x|_}{XNJN^m6`aIw8s!sKq9FJ0Kzo6sm?l?L4}KVme$2cN7Ji zS)yO6Wp-fOweL}QC5xIMBOLZ-oah*}rU(phNKkFi&k%y}v_Ugd z4V{#LYp}0D;M(Wk5<5ZKcNVn2mxesj(mgv#uW)V0Hb|Cz)=5+6gIFZ}=1+>NW`z1- zHLT`iIu!N+s@NG%uscjs3PiQF-RoBKl$wDdGZ~$62fo&)om~uNH%Hs#<~s7boowfg z{t9TW%Qu9K#jz#0)hmnlujlq_j3tIQZey{GQ5acv*>d%pYiYMK9h~Wc)L-SEqGes` z%SOh9vdI^U9+s;(JaS5tw4EF07BRkvRg2{hzG72h^IGzY5WI3^P&V@@1AG(Wa(k!I zOj%?|O$NB2Y-SLS9Vw$JWyuyPc{G?4s2^X#5+eubwG`B;^9>_zL%U=;5%ocda#TrI zI4|BO2`IdN;n-*p?@Ep!fQDyeye`U{;5wx9h@Thvb!0qQeVvn#Xc^(WCd{6bZeU{8 z05MuL;a92zvHDh8VT(}daNEFfeaiSTQXI%YCl+HUp(;qx3QE*LJf9LS@`~4S1#rqv z8sEgM#*p6&cS>tN{05JcW{oEo1h3ZrQlZ^`p&#TKE#9H8v3;%A`F;>t_&)5dv)1cG znlNAM-##p^O^uORwW_`&uZ`Nqtx2SlwTim48~-#L}nezHr8Y> zYvthzV~>$KS*WgfGOk_o#P!HB%8}EUQJxfE$$}|RnSfz zBB!{LmXVg0gMV3!*O$U8u3NHQuwASN#wau?4*FL1zq60BTS$VgSh~}&;ZeLS!^yj% z+Nu~E$FefdA0dRxkIIjjV@nvPN!-^E(80mQg4X~vYeeuw3I}05hsF&T`^6 zry|wy4cF2SUg&LueCN0<0zT9o83G9^2h(c~nk$TY>FBiwpv*LHu*HLf>n`05N9%G7 znXbc&^9g4p?GsoNv1na^i7F)04tGa^S;-^JzL*T>EXmU1FAo zg{Bih#oqsitwm+}&M6H-R=dt%xUs@^D|wtDd|27AL05HTXT{gF_%p)Qctn77zJ(XL zHO5v)R9=z@O)WDNcXo|?3NO5|IILeSn&}DluMnSt_ltbJ)ud1m*1>c_h6DAkA}du2 z)$-KfxzSO5rd}DIDsxRGhg(+G_LZ;Lxb6*aI5KnfXxqpr@f35k24Fk?7Fabqp=ony zlN{y1K#|59pj)*xrr+l3GG1gVkkNknwxFm=re|;*7M1csJ=O$cA47o}ZS1L?xo*2) z-CTGPjm{e}IJGDkY>C83%4J%#wEzF4%-*3T!NVqG?O-uV_o(b&P1Q{;CZv 
z4H?#0l<^ltMJ<2zT}`}Lk?S5TcC(!<4Xt#&S@~s>C%kt<%~Ec27i$uJd=wA*aB;C zKyOAvoP=_cv4qBB&c4LBHp6Ew>BE$ZR-nS~=Vyd6G+lG{nFRwqfC$_x;Vq+YnJG;? zQBbja8u3H3ugPg8A7Kxfy~$h4{*LG;=5H+`2TSjeh7uycv8(Z0(>KMj90un7<18n# zFP_n2|J)+jS4dBE(WyMO5i@6|N4(LFGt9Lla$o0NJ$w#qO~Sy9`@yFc{M)PKzR-sD zMVfZ2=^&1lY1`JOrH>;0Ne5Nk*0b$E!Vhj`Nw~wM5P}bCZ-z*WtfQh@?k9ZbX7))59m!|H;37b= z0RbDd)SJNF3-M>1k%v{rC1}6^m1iE8fv_)B^9gGKGf_Zl7gpw-zU817Vym+N(|jZc z>wttZN|+o{P(!^T`O}G!B*hhRj`Z0OobinW+g^P}*IpIq%6q~jD3g`l6HC&HDMMBD zZRR9%1g(Xq>`}T{3hOqX5Q-X{ya_jDf#`%ll1xq}dU3K8<;s8K zU~|?MJ?Ei6=LRaX3)_o-MUx&+A#%pyk7oiPto&LmD;XUubTyAsFTs$N1AtyePq5? zzp}E1O?7b_n@_XDnl~|%7|K0K0r~r&(}iLMkFz?L41o3c)0B_XP>)T+PL)T* z51eL5>uRa!YvIx$n;ubj)2wS*Mf{*pH=BwMR*GR6obC9)L#)c=HUT;}s@X`QT(ySR zQLl1Xn%EkYl6wv>VI(Tx${H#6(2H^#|I&isprqy1cdv{FxXgqEttaCnfryl#OKq{o z+fPVKl(;mu59tQ>q&PS#)jDmja>GkI^GfH0MCCxO+n_8i5X#r&s+=Y+wyP4UQ|D-R zBA3yyc&`q@s4I=5^PxwKg>LR8$Cij9=G`8hbN{fajG(CUq(=*Ar$DXLicw zkvaqU)puFtyO{_`66(c#3O4joA&SXv6#5fvcpFUFFDa{dZ+DNl-Rm6&bPATcO5H`j zI3IJmkW+{(m*qK}rpF(wS}lJ5gf0{5acpE6c2G5GsUk+F2-h#N1!)(v_*$Ce@U+73wZ;cf#DPdq z;AVeao@HHImI{8!ES{ikr(p8>QA5DNCh>-Jtq~y?>pwii@ln@LAdjDwtgrGUEnd%9 zKO=74zW3ZUSbVvT(y4lRD71^b&tE*>@c4MHA}cfjI&1ak{VJbte_mENnSIa@W0&%D z99*l^+4a7+sN@&O56{#e^mWpHJdfFlw3h`k8wB7fN-PUF?N0eI`VQYOWPjIpS$Q>G zBD)B*X48HA&1M9gz<;xkRyR@GMY2p2#tl?j#t5lF4la(aA$8s&)iFjdhp15 zsB2VennkYEbpn0Ic!wLf<3x)-(p{@~I5;906S50kLI>c}rz- z9S_gz@ZEJnIQS6l*~Nyqc>X8Y%19p;UCJr)Sdl*UbDG>_EKh=x%p$HQDKT(bc6uRI zS^G?aJ=NfdG9~fa)Jh0`PGa0H1Q|a3ecJCn_Q6tLisVsZ#2bQ`ryhW1 zi;kLZqOVF{E=4LJ`%M)8qbZ$Cp!z7{x*jh+fb$SXa}X@4zb^}HZ(nM4n3Cw1Kl|%% z=|SbDD%D!UWa(5`OB{OsTv`vw*)l9yzFZb&oG9}okP{=4@OVp9h$!*;bmoN#37@8trQ5M=`riC3{9aCOMXq!r-X?yxiD8V8*lA3n z9x>k^wCB(vQeBwXln1uws{6|O;tV;?2h18QICptNf3e!lXu(#Is+}ktq ztdVWbz9SnyVTjxAt6qa~i9;WJ?qFb>%l?ju}|M!j%d5 zKnM~r{~DB%s$?3Gta#>i8?f7g@ns`5HLqkhj{N?0_=mr+Y}AF|V-oc^C`fCF5Y){v z_J{fGqJ{&t(kQS*qEif8xA=6NX60z*+?62)*=pPKOb={r`EVj1briXg^3VRNQ)w5F z*n!d;qgN91=Urb%$?Pl!Lf0Vim&Z9RvTC6s-H?rZva zS$6AQ3D3g0+8x8r6enZYn#W>M{63*X(~_gM0s2YYLD^wx 
z2b&A=t$nlSY`)3k7Eh*)8D~b_0&+j&pQ$P(LqH=6^SEk(f-`q%@7Qk|Z$$Su&qU98 zcU7OY#_z4Vy@^X6bVSNOG;)=krB9}!O;m{XritF;dpL->J05Zv?v+<9<;mO_XC|Sb zWz0uh8ZYz|e*7%*b5DBPQ{BVvsyJy>nBuPASPx$R=MJ*z!1h}YJ|XEtG2wGXm&~01?&3bS&d${KyUJ~y*wbh_7pn7`xSruiX-Dqd((dkXAtD& zWfVl2XtTtzhR2N5(=0G_6bed^$p%j>dRZKSF8Lpin^^=h1i6rJMG}#H*NFgj}@g50j zXLq@ezCga-0dcr!j4lz?2kIGG0TpV@nZW3xC7?28o{jFPA516d*Qr}%LGtd}di6}H zgMw}94_fcVreFsfz5}fQ>~6sJOI0 z!%NfOeJA@NmkkQ`1;>M;%e8RyHS31Uia`zW+{78$L2~S`NW}AfoB>cfGGDdWX8D5r z+>Ay1K}xJi4#v_`YBF^nch>~3DS|bC_);r~-mXW(OalipMvidM-aT>8oO}^tL-#x& z0lD8@WCI=Rx}YRu{0$dQWUuYGUxU1FTx1U^S?>s+C(>8Wcc=@NPfMfw80E!45Uh1m zCUskdL$;~^COuq*gm75#kN^*kDit?X-ljZ3Yz1ZVX`VxzPHn}FnsapV*7d8(%?*eB zhu*H&^St-$Chv{c`_;R94>OeFwMK)PeAIIou`Jnjl502TPXg*D&|&#faYz{Lnr)G=?x9n zxm>({g>}Z82>pHNJpri$wj8BEe<#^2qqXS2-$Zfo9nWFu>{M!FrK+Pdd?~{RY-@Gv zCmVSy3_J~`Rs&CVBkD0i5hVy@tPoxaS`UYrUb})@)$yvTh7g3y+Y+Op~dsXzw<6$#JF`rCpD-c zvf{xSvPuQ7h1doTsXLfb&JWe;3PkxIpZACrtyiG33VU^21|aKa)h}qiwQCoZ z`gy%M-lt3_`9<*fCV|n&f><8z!Vsq-B=quT-=9?C$bB1Lt2N`3PCk8j6Z@Qmy+9W@*_2~HCepD-RSv3$+*DyvEo8(FvVaYH?`iZVNR77yE zfpRIj0e*<6L|tWs9WPj)f<^V&0MHAFIo8~u1!Zf7XtLx+8eC&*8Iyjd(;0jcSSK)65U>lTX`C>l*EnrX|S&_7C#f=Zg&lhl=>_)5r^hic`xEh4mq5+q9n?r^gb z+m?V}UtAfnl=utC^r30R9bslM>a*adbr7}r`{VDyTxUvi%0ddo#Rw=$$G3djAH@ql zf0aY@wC9JBm&d273q{FGVvA-iGlltXSTjGYLKk4yg#C`0F#>TWY?c*+C*BrnP#670 zHC6STm@)g7sorFn*FERwP4UWv&ccd16ZyBG8PmG6pa;KvJ8q%3G_R_sH0^8+?V2`6GO3fj zEeIFGGiitwQ4b>={?eMeCtlYTE>Qwc_0%uF4=F{#W#?P-~b5BqQ@jhd$= zpH!0mh6`s(f&kC6FHhPF1DVTp>Jgj3uCH{b`=+1EI|sPZQO~$1b4_cnStA zCy!Y&q$SB^!Y^W*ze3t{)Lkgj;^8*LJ%Oo71_xST%n2p!`Q4Rg4$JWiJf=e0cw@CU z`3u{A5hHPFGM)ut))hZ~2|8rY;qTf%%RASJuC#-zM(%rAl9gC{%Qo&ms<@7W*wKwE zc?%@!x+!^EQJS`SC})H#+n|(~Rar64!;TZPY!Sz;AU$5t=yOhy)t2lxem*8gTv$sH ziJ$#Ajkj2lDQO5wZBs^UNtp0rJlfZouhP-i;9SN@>b8x4|ERp%F!hl+tPi=Sv04GJ zSgcI&`(cJDpH-HsNH6fFNe`EhdB|F>Ji6W>&V6pssC33Xt$eqbSg3q9r0ze@=W^%| zz(*)70wRj7GS8wtsm*xHIcz`G`LNBjbY)rww8!LPWvY$vOPY%r$$b`hhXu#7`>u0H z+ZNv5zGb8n*4@TE9hEKQpRiXQd={HmibJ!9E>mle<_iGLy8=8N%|ER4RZ6lnf081i 
zeU|7glTAwKN;9SwIB3rodruYF^d`Fm4-|j9DgG$&8P#C5lJKn9$(;z=i8|$9XVo&` zY=N|EtkG0>ybh)D+}m^%+{tUE8rRojbTM&`8-DLhjzeoH#(YpAocm6B=bC9ajwg;Z(u8}lakh7NPAjHe9t9O$n)Ev$?f|+m03T{1@)2g(r9}m_q6hO zrTnE_bM*)2iAUYlbr@+wV`p1gXIp!3kyp*y;(8`Po&A~f8(WgQnAJ(j;{=4}4E>}g z#QhfqMY|RXrg;=*PR{;*?DEy8&;GupL9>TXb4{aHejC{a9YeUiCV?LR-1-*&jrU4& zrPf7Jw~uN3$7P}}vz@Pmjf025bLH0q4VQ+Ta6(jp>wMy?oAWhuLv33-U}Jgf-|^k& z_}HXRKn;(FM+vVYJL6I92KJ53UD<`}fR@iWHG?NHQB9}k^PS_|rX8q-Dx|u9iL6zuPKk)oV;Bf2CF`~_IYK*bC zj)TNs?S@ip>h{d$M7(gB!kl^abdf5ULo#&KVYme^2^*KP-<1bmOoj>)em0$Jz|mQZ z&M~phRp2-ahZyYD@NtWsv4MpP{FO1Lp273Rilu^}Yil#I=jjbrkzQZq*XdG(2lB%+ z!oy<51291I(h9&?Nu4uwEfM^O-4#mlhI)4 zB}0bmdd;>T_tmFf4!7Rt;V8FD$D4Yq^r0sHcD$T*k69JsygYTx0wp}w%5+tRhL#ei z!yhmwqnMw=T@&Y!GL^%T?NL=#CmM|9td@uy9Z#_W9c&W_U_tZ`5AM$(d7NnLFK|x_ zM{fRyNDtz$VI1zr@u@Z27Zu3})pp=`3BLa7dOpQT<_%3n>>XHhSsKs4qF|hLV{kSb z2Yn*UPMvT`RBMGSgaqsOqH>%o{~6EW)>wVRpC>>kH3yH?!!Cc=&+V!r3^Q;;`Boe7 z_NyyP+|#gHqs-t(6|Ce9>^zLI83-CmJP#Bjz1=PDxvd4*#p{cxCk4A40E6Bp?$t$6 zR9Tfj`*;>aaH-biMMSH6HP~5~=@F@-lbzq_#&WZri;fO-0(^XuekC%CloMkKAbQpB zu9%0ngXfK6y!fZs&KvoBA(#|=r_sd&LbI#;LraoVK4Zj|-ZZ{=x?p{!1sTZ}c}OX; z*cr3K^5K5wsy)UkqS*T>lWW{VZHmSm-E0==bBhPEYiob4<}T{CklEhC5jx+VX*0NM z?htbzX9F-D3t?_|YeR6{q_N)o*Gg4ShYrtC1XN8vvv%QKZ20wAo;XS>Lj_9HQ1kvk z#@YT<*M9ZZz?N=|xfKB8`1lADON6h_Mu2G^ZyNMwjyKU+UaLSPiv_}-$`j!i}L(Qj;zDql^ue5OO!lUeB`g`2HO@?DYrDgW+soE zTDElUIff_}fy6v77oJ)|&j|DU3+J16C|@2}^FDWvc0G8hyF?-f{b#+KMBfDBZVl*{ z5r0Y`6Hq11=MN1U%K7$zbljIs>W%jW4Fa=t!GuCm z?F{PzP156P(iY7w)dpXC4lv~|l}|kdW1j2X4mtg-x)Mj=t-2ys>P+WJ+>3UOgAxLCeVSxn{Exx5;Z`8R1sa69zVc=+k0}2r91=icN0l$eTi#KiN6{gSqfN+0G8lMaQe>zlNPpQkYF*g0u@T)pT%GF7P=G5c4c>C+?7Fl=rnhrXuE85T84GlVREX zcM%xH@h#Y;F7#7(ssxl!+kJv^nl$EC;WcF8{vQDjJl-mQvY>WBygzR-|G*QNcK3B( z-EKqecJ6smdEfZosPBc=3%c?RRnwm{Y^XNZoE8**ijPLAj;5Qxa4^fE0vgwaC^F@& zH6UjlEg@OQf=8ahZ9@KR!3{>yA#|$axYsf*)iKd)8kK7qjcFO-^aD`6 z#%{ynjl9Mj)y9~}OsU~;;Y30o;tK|X@o!n`6MBp92=Go8xr7eHx@_0{*%S_{5)2U0 z;#81d9iDvli2mHx-(Y=*<0an6v{#jo%fjV79KC70vy>!a&`8}24hkb%((Ht(a$$_C 
zb{A_`M3zgdu2<5lWzwo)s=lfQZd4yH!&-N1SE*N}Sj$hBCuDvE8j_3cY-1$zNhK3R zaKrsYh`>Dy(T1wRb>+rutZ$LMO630Sekw*+X@?rtqlre)zd@IVOcUsG8{dw#yfW>C zFpaPrTW%Z`7Ae{ynklZaO4RpK3pIww+!kY~sv#I?Qhch;`O1hFOB~JW(7gyKLqsf= z`vgVW{bd2(Si|U%oHZ#+!LDY2%07OVyld$w``Hi0k~CECZk@J-%%8f(l&s*dX)Fp* zjQxm1l1Mi;2f8hzno~;QoyS>1XY8L37whtBE;B`hiBZ?_1b&bg{FEgBDcJ=xJ2^%E zbDTWkwhLpH7xAXg*3WGjv{x{eHEVw=H@3->XU00RNqfYIg5Vp2U|H+1jLUa<2YdAV$3gW9-}%|#1e0#qi36$ZH+WiLXpB()9A=^vXT|jJ0WL#oHNWf*qI2T=-Xa42{?puidur2aHU~dLM6m~ zZ|;~{Y}SD9v(g8Adt{igiX)P3DuE zQ6C7$-{K%JHzLOA5aI~EJa21YAlmy-Htqu+{;;A@I6gy_4|bA?QrcSt=1S8Cr%uep1ji`lYGi9= zB5ZGAYexLfRsH2?_77Bduy++R`*)$Xa&&cY{s(ve^W}}~{$)gj|1;=B9o(F)%$()T zJQP*rRN)wfT};gEU5U9^S^iD7HL`FaX8*5M|Il|c7UKVarKuyPVJ|Krp3;21@X93{=HEG%9B6Zluke+u}|rmRd%|6#}f7lh9C-*x=I$a+?G zw*QSe*GrVK4`MSgLKeCtX;F6 zdjCa$x!lgO(R4cf*a$VNZI>KW(8~*Z3CC$S;7=uMbw(5}xX#XjP*~CP0~%S+Y$Ml9 z3x^4-TLCu6gG`4|%`b#X*V_#)n}f6UjowtCip)`C# zK`J4tQ1y=#B^lYfv%53C<@fFVwxWye!#b(oE-S_l1fL<*!@FvsFB>sWd|%x0o9b=0 zI@PAyPt>a9y0@#JinUUWKD;axx5cYs{XNLqGTwBD;KriGJT01%`{YkOVwG@W@5)39 z-leHseBcMk>kANNlJpy_J^a$t?&?BHn zK#zbP0X+hG1oQ~#5zr%`M?jB&9sxZ9dIaE(phrNDfF1!o0(u1W2eBcMk> zkANNlJpy_J^a$t?&?BHnK#zbP0X+hG1oQ~#5zr%`M?jB&9sxaKH$5V^|Ga(2dIbM* zd|!Hm^zDU!kNCg&2r2E;+V~OX6z>fn!M*i&fP`dFr9S;Ayz!Vk3MtG+IfR6CyqA`B zIV=c!sV}_IadA;LLbeQt1yh(vH}1=7K{$srI)#voR!oItnu0z|PZFZh4s6CF!KGf2 ze__;>-&%{gyBl#@>q58AjH<4T1jGFp=dsA5fQFlv4aqWRlw7+q_B71Il(VYOW~CPM z(ecsb18W)97&l*&@mZ(1bh>y2KQi>eq-Xs_K-Irf}sFT2gI-6}JO1KBAv zQ!um*J8q&m^joFsARqfq^WTz|c}dH6GWe)avwqbFh* l;s--rQ**DWomus{ mimetypeLimits; private ExecutorService executorService; - protected MetadataExtracterConfig metadataExtracterConfig; + protected MetadataExtracterConfig metadataExtracterConfig; + + private static final AtomicInteger CONCURRENT_EXTRACTIONS_COUNT = new AtomicInteger(0); /** * Default constructor. 
If this is called, then {@link #isSupported(String)} should @@ -1219,6 +1225,12 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac logger.debug("Extracted Metadata from " + reader + "\n Found: " + rawMetadata + "\n Mapped and Accepted: " + changedProperties); } + } + catch (LimitExceededException e) + { + logger.warn("Metadata extraction rejected: \n" + + " Extracter: " + this + "\n" + + " Reason: " + e.getMessage()); } catch (Throwable e) { @@ -1968,23 +1980,29 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac * Gets the metadata extracter limits for the given mimetype. *

* A specific match for the given mimetype is tried first and - * if none is found a wildcard of "*" is tried. + * if none is found a wildcard of "*" is tried, if still not found + * default values will be used * * @param mimetype String - * @return the found limits or null + * @return the found limits or default values */ protected MetadataExtracterLimits getLimits(String mimetype) { if (mimetypeLimits == null) { - return null; + return new MetadataExtracterLimits(); } MetadataExtracterLimits limits = null; limits = mimetypeLimits.get(mimetype); if (limits == null) { limits = mimetypeLimits.get("*"); - } + } + if (limits == null) + { + limits = new MetadataExtracterLimits(); + } + return limits; } @@ -2027,6 +2045,19 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac { super(cause); } + } + + /** + * Exception wrapper to handle exceeded limits imposed by {@link MetadataExtracterLimits} + * {@link AbstractMappingMetadataExtracter#extractRaw(ContentReader, MetadataExtracterLimits)} + */ + private class LimitExceededException extends Exception + { + private static final long serialVersionUID = 702554119174770130L; + public LimitExceededException(String message) + { + super(message); + } } /** @@ -2049,12 +2080,34 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac private Map extractRaw( ContentReader reader, MetadataExtracterLimits limits) throws Throwable { - if (limits == null || limits.getTimeoutMs() == -1) - { - return extractRaw(reader); - } FutureTask> task = null; - StreamAwareContentReaderProxy proxiedReader = null; + StreamAwareContentReaderProxy proxiedReader = null; + + if (reader.getSize() > limits.getMaxDocumentSizeMB() * MEGABYTE_SIZE) + { + throw new LimitExceededException("Max doc size exceeded " + limits.getMaxDocumentSizeMB() + " MB"); + } + + synchronized (CONCURRENT_EXTRACTIONS_COUNT) + { + if (logger.isDebugEnabled()) + { + logger.debug("Concurrent extractions : " + 
CONCURRENT_EXTRACTIONS_COUNT.get()); + } + if (CONCURRENT_EXTRACTIONS_COUNT.get() < limits.getMaxConcurrentExtractionsCount()) + { + int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.incrementAndGet(); + if (logger.isDebugEnabled()) + { + logger.debug("New extraction accepted. Concurrent extractions : " + totalDocCount); + } + } + else + { + throw new LimitExceededException("Reached concurrent extractions limit - " + limits.getMaxConcurrentExtractionsCount()); + } + } + try { proxiedReader = new StreamAwareContentReaderProxy(reader); @@ -2087,6 +2140,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac } throw cause; } + finally + { + int totalDocCount = CONCURRENT_EXTRACTIONS_COUNT.decrementAndGet(); + if (logger.isDebugEnabled()) + { + logger.debug("Extraction finalized. Remaining concurrent extraction : " + totalDocCount); + } + } } /** diff --git a/source/java/org/alfresco/repo/content/metadata/MetadataExtracterLimits.java b/source/java/org/alfresco/repo/content/metadata/MetadataExtracterLimits.java index 05ad512624..1efdabb76d 100644 --- a/source/java/org/alfresco/repo/content/metadata/MetadataExtracterLimits.java +++ b/source/java/org/alfresco/repo/content/metadata/MetadataExtracterLimits.java @@ -29,15 +29,17 @@ import org.alfresco.api.AlfrescoPublicApi; /** * Represents maximum values (that result in exceptions if exceeded) or - * limits on values (that result in EOF (End Of File) being returned - * early). The only current option is for elapsed time. + * limits on values (that result in EOF (End Of File) being returned early). + * The current options are elapsed time, document size and concurrent extractions limit. 
* * @author Ray Gauss II */ @AlfrescoPublicApi public class MetadataExtracterLimits { - private long timeoutMs = -1; + private long timeoutMs = Long.MAX_VALUE; + private double maxDocumentSizeMB = Double.MAX_VALUE; + private int maxConcurrentExtractionsCount = Integer.MAX_VALUE; /** * Gets the time in milliseconds after which the metadata extracter will be stopped. @@ -57,6 +59,44 @@ public class MetadataExtracterLimits public void setTimeoutMs(long timeoutMs) { this.timeoutMs = timeoutMs; + } + /** + * Gets the maximum document size (MB) allowed for a metadata extraction + * + * @return maximum size + */ + public double getMaxDocumentSizeMB() + { + return maxDocumentSizeMB; + } + + /** + * Sets the maximum document size (MB) allowed for a metadata extraction + * + * @param maxDocumentSizeMB + */ + public void setMaxDocumentSizeMB(double maxDocumentSizeMB) + { + this.maxDocumentSizeMB = maxDocumentSizeMB; + } + + /** + * Sets the maximum number of allowed concurrent extractions + * + * @param maxConcurrentExtractionsCount + */ + public void setMaxConcurrentExtractionsCount(int maxConcurrentExtractionsCount) + { + this.maxConcurrentExtractionsCount = maxConcurrentExtractionsCount; + } + + /** + * Gets the maximum count of allowed concurrent extractions + * + * @return maximum count + */ + public int getMaxConcurrentExtractionsCount() + { + return maxConcurrentExtractionsCount; } - } diff --git a/source/test-java/org/alfresco/repo/content/metadata/MetadataExtracterLimitsTest.java b/source/test-java/org/alfresco/repo/content/metadata/MetadataExtracterLimitsTest.java index f53fd18d49..1e38c9dc74 100644 --- a/source/test-java/org/alfresco/repo/content/metadata/MetadataExtracterLimitsTest.java +++ b/source/test-java/org/alfresco/repo/content/metadata/MetadataExtracterLimitsTest.java @@ -1,28 +1,28 @@ -/* - * #%L - * Alfresco Repository - * %% - * Copyright (C) 2005 - 2016 Alfresco Software Limited - * %% - * This file is part of the Alfresco software. 
- * If the software was purchased under a paid Alfresco license, the terms of - * the paid license agreement will prevail. Otherwise, the software is - * provided under the following open source license terms: - * - * Alfresco is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Alfresco is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with Alfresco. If not, see . - * #L% - */ +/* + * #%L + * Alfresco Repository + * %% + * Copyright (C) 2005 - 2016 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . 
+ * #L% + */ package org.alfresco.repo.content.metadata; import java.io.File; @@ -214,15 +214,6 @@ public class MetadataExtracterLimitsTest @Test public void testUnlimitedTimeout() throws Exception { - long timeoutMs = -1; - - MetadataExtracterLimits limits = new MetadataExtracterLimits(); - limits.setTimeoutMs(timeoutMs); - HashMap mimetypeLimits = - new HashMap(1); - mimetypeLimits.put(MimetypeMap.MIMETYPE_IMAGE_JPEG, limits); - ((MockDelayedMetadataExtracter) getExtracter()).setMimetypeLimits(mimetypeLimits); - File file = AbstractContentTransformerTest.loadNamedQuickTestFile("quick.txt"); Map properties = extractFromFile(file, MimetypeMap.MIMETYPE_TEXT_PLAIN); diff --git a/source/test-java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java b/source/test-java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java index 574b696463..7a0e1682e7 100644 --- a/source/test-java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java +++ b/source/test-java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java @@ -25,12 +25,18 @@ */ package org.alfresco.repo.content.metadata; +import java.io.File; +import java.io.FileNotFoundException; import java.io.Serializable; import java.util.Calendar; +import java.util.HashMap; import java.util.Map; - +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.transform.AbstractContentTransformerTest; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.namespace.QName; import org.apache.pdfbox.util.DateConverter; @@ -42,14 +48,25 @@ import org.apache.pdfbox.util.DateConverter; */ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest { - private PdfBoxMetadataExtracter extracter; + private PdfBoxMetadataExtracter extracter; + + private static final int 
MAX_CONCURENT_EXTRACTIONS = 5; + private static final double MAX_DOC_SIZE_MB = 0.03; @Override public void setUp() throws Exception { super.setUp(); extracter = new PdfBoxMetadataExtracter(); - extracter.setDictionaryService(dictionaryService); + extracter.setDictionaryService(dictionaryService); + + MetadataExtracterLimits pdfLimit = new MetadataExtracterLimits(); + pdfLimit.setMaxConcurrentExtractionsCount(MAX_CONCURENT_EXTRACTIONS); + pdfLimit.setMaxDocumentSizeMB(MAX_DOC_SIZE_MB); + Map limits = new HashMap<>(); + limits.put(MimetypeMap.MIMETYPE_PDF,pdfLimit); + + extracter.setMimetypeLimits(limits); extracter.register(); } @@ -107,5 +124,49 @@ public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest assertEquals(52, c.get(Calendar.MINUTE)); assertEquals(58, c.get(Calendar.SECOND)); //assertEquals(0, c.get(Calendar.MILLISECOND)); + } + + public void testConcurrentExtractions() throws InterruptedException + { + int threadNum = 10; + final CountDownLatch extractionsCountDown = new CountDownLatch(threadNum); + for (int i = 0; i < threadNum; i++) + { + Thread t = new Thread(new Runnable() + { + @Override + public void run() + { + try + { + Map properties = extractFromMimetype(MimetypeMap.MIMETYPE_PDF); + if (!properties.isEmpty()) + { + extractionsCountDown.countDown(); + } + } + catch (Exception e) + { + e.printStackTrace(); + } + } + }); + t.start(); + } + extractionsCountDown.await(1000, TimeUnit.MILLISECONDS); + long rejectedExtractions = extractionsCountDown.getCount(); + assertTrue("Wrong number of rejected extractions", rejectedExtractions == (threadNum - MAX_CONCURENT_EXTRACTIONS)); + } + + public void testMaxDocumentSizeLimit() throws Exception + { + File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile("quick-size-limit.pdf"); + + if (sourceFile == null) + { + throw new FileNotFoundException("No quick-size-limit.pdf file found for test"); + } + Map properties = extractFromFile(sourceFile, MimetypeMap.MIMETYPE_PDF); 
+ assertTrue(properties.isEmpty()); } } diff --git a/source/test-java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java b/source/test-java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java index b8d1cfabcb..e1897cce94 100644 --- a/source/test-java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java +++ b/source/test-java/org/alfresco/repo/content/metadata/PoiMetadataExtracterTest.java @@ -1,28 +1,28 @@ -/* - * #%L - * Alfresco Repository - * %% - * Copyright (C) 2005 - 2016 Alfresco Software Limited - * %% - * This file is part of the Alfresco software. - * If the software was purchased under a paid Alfresco license, the terms of - * the paid license agreement will prevail. Otherwise, the software is - * provided under the following open source license terms: - * - * Alfresco is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Alfresco is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with Alfresco. If not, see . - * #L% - */ +/* + * #%L + * Alfresco Repository + * %% + * Copyright (C) 2005 - 2016 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. 
Otherwise, the software is + * provided under the following open source license terms: + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + * #L% + */ package org.alfresco.repo.content.metadata; import java.io.File; @@ -48,8 +48,6 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest { private static final int MINIMAL_EXPECTED_PROPERTIES_AMOUNT = 3; - private static final int IGNORABLE_TIMEOUT = -1; - // private static final int TIMEOUT_FOR_QUICK_EXTRACTION = 2000; private static final int DEFAULT_FOOTNOTES_LIMIT = 50; @@ -67,6 +65,9 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest private PoiMetadataExtracter extracter; + + private Long extractionTimeWithDefaultFootnotesLimit; + private Long extractionTimeWithLargeFootnotesLimit; @Override public void setUp() throws Exception @@ -245,7 +246,7 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest * * @throws Exception */ - public void testFootnotesLimitParameterUsing() throws Exception + public void testFootnotesLimitParameterUsingDefault() throws Exception { PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter(); @@ -256,23 +257,42 @@ public class PoiMetadataExtracterTest extends AbstractMetadataExtracterTest Map properties = new HashMap(); long startTime = System.currentTimeMillis(); extractor.extract(sourceReader, properties); - long 
extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime; + extractionTimeWithDefaultFootnotesLimit = System.currentTimeMillis() - startTime; assertExtractedProperties(properties); - assertFalse("Reader was not closed", sourceReader.isChannelOpen()); + if (extractionTimeWithLargeFootnotesLimit != null) + { + assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit); + } + } + + + /** + * Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for Word office document + * + * @throws Exception + */ + public void testFootnotesLimitParameterUsingLarge() throws Exception + { + PoiMetadataExtracter extractor = (PoiMetadataExtracter) getExtracter(); + + File sourceFile = AbstractContentTransformerTest.loadNamedQuickTestFile(PROBLEM_FOOTNOTES_DOCUMENT_NAME); + ContentReader sourceReader = new FileContentReader(sourceFile); + sourceReader.setMimetype(MimetypeMap.MIMETYPE_OPENXML_WORDPROCESSING); // Just let the extractor do the job... 
- configureExtractorLimits(extractor, ALL_MIMETYPES_FILTER, IGNORABLE_TIMEOUT); extractor.setPoiFootnotesLimit(LARGE_FOOTNOTES_LIMIT); extractor.afterPropertiesSet(); - properties = new HashMap(); - startTime = System.currentTimeMillis(); + Map properties = new HashMap(); + long startTime = System.currentTimeMillis(); extractor.extract(sourceReader, properties); - long extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime; + extractionTimeWithLargeFootnotesLimit = System.currentTimeMillis() - startTime; assertExtractedProperties(properties); - assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit); - assertFalse("Reader was not closed", sourceReader.isChannelOpen()); + if (extractionTimeWithDefaultFootnotesLimit != null) + { + assertTrue("The second metadata extraction operation must be longer!", extractionTimeWithLargeFootnotesLimit > extractionTimeWithDefaultFootnotesLimit); + } } /**