Merged 5.2.N (5.2.1) to HEAD (5.2)

126004 aleahu: Merged 5.1.N (5.1.2) to 5.2.N (5.2.1)
      125892 adragoi: Merged 5.0.N (5.0.4) to 5.1.N (5.1.2)
         125842 rmunteanu: Merged V4.2-BUG-FIX (4.2.7) to 5.0.N (5.0.4) (PARTIAL MERGE)
            125700 adavis: Merged V4.2.5 (4.2.5.7) to V4.2-BUG-FIX (4.2.7)
               125698: Merged DEV to V4.2.5 (4.2.5.7)
                  125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika
                     - Should not have updated version.properties, as the original commit needs to be merged forwards.
               125696: Merged DEV to V4.2.5 (4.2.5.7)
                  125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika
                     - Modified tika parser and tika core jars to allow some configuration parameters to be sent from Alfresco side using the metadata map parameter
                     - Excluded by default the parsing of drawings/shapes xmls because there was little valuable data that could be extracted from those xmls


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@127835 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Alan Davis
2016-06-06 08:34:29 +00:00
parent 640535fd09
commit ce5a675361
13 changed files with 254 additions and 9 deletions

View File

@@ -241,6 +241,13 @@
</property>
</bean>
<!-- Metadata Extraction Configuration -->
<bean id="metadataExtracterConfig" class="org.alfresco.repo.content.metadata.MetadataExtracterConfigImpl" >
<property name="properties">
<ref bean="global-properties" />
</property>
</bean>
<!-- Metadata Extraction Registry -->
<bean id="metadataExtracterRegistry" class="org.alfresco.repo.content.metadata.MetadataExtracterRegistry" />
@@ -261,6 +268,9 @@
<property name="properties">
<ref bean="global-properties" />
</property>
<property name="metadataExtracterConfig">
<ref bean="metadataExtracterConfig" />
</property>
</bean>
<!-- For coordination between OOoDirect and OOoJodconverter subsystems -->
@@ -466,6 +476,9 @@
<value>true</value>
</property>
<property name="retryTransformOnDifferentMimeType" value="${content.transformer.retryOn.different.mimetype}"/>
<property name="metadataExtracterConfig">
<ref bean="metadataExtracterConfig" />
</property>
</bean>
<!-- Abstract bean definition defining base definition for all transformer that are not registered.

View File

@@ -175,6 +175,9 @@
<property name="tikaConfig">
<ref bean="tikaConfig"/>
</property>
<property name="metadataExtracterConfig">
<ref bean="metadataExtracterConfig" />
</property>
</bean>
<bean id="compositeRenderingEngine"

View File

@@ -650,6 +650,9 @@ content.transformer.Poi.poiExtractPropertiesOnly=true
# The default timeout for metadata mapping extracters
content.metadataExtracter.default.timeoutMs=20000
# Indicates whether the metadata extracter should parse shape/drawing objects inside Office Open XML files (e.g. Excel .xlsx)
content.metadataExtracter.parseShapes=false
# Property to enable upgrade from 2.1-A
V2.1-A.fixes.to.schema=0
#V2.1-A.fixes.to.schema=82

Binary file not shown.

View File

@@ -148,6 +148,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private Properties properties;
private Map<String, MetadataExtracterLimits> mimetypeLimits;
private ExecutorService executorService;
protected MetadataExtracterConfig metadataExtracterConfig;
/**
* Default constructor. If this is called, then {@link #isSupported(String)} should
@@ -418,6 +419,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
this.properties = properties;
}
/**
* Sets the metadata extracter config held by this extracter.
*
* @param metadataExtracterConfig the config instance to store in the
*        <code>metadataExtracterConfig</code> field
*/
public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
{
this.metadataExtracterConfig = metadataExtracterConfig;
}
/**
* Whether or not to enable the pass through of simple strings to cm:taggable tags
*

View File

@@ -78,6 +78,7 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter, Be
private long extractionTime;
private String beanName;
private Properties properties;
private MetadataExtracterConfig metadataExtracterConfig;
protected AbstractMetadataExtracter(String supportedMimetype, double reliability, long extractionTime)
{
@@ -132,6 +133,14 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter, Be
this.properties = properties;
}
/**
* Sets the metadata extracter config held by this extracter.
*
* @param metadataExtracterConfig the config instance to store in the
*        <code>metadataExtracterConfig</code> field
*/
public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
{
this.metadataExtracterConfig = metadataExtracterConfig;
}
/**
* @return Returns the mimetype helper
*/

View File

@@ -0,0 +1,43 @@
/*
* Copyright (C) 2005-2016 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import org.apache.tika.metadata.Metadata;
/**
 * Contract through which Alfresco supplies configuration properties to metadata
 * extracters such as the Tika parser.
 *
 * @author Andrei Rebegea
 */
public interface MetadataExtracterConfig
{
    /**
     * Default value of the Tika "parse shapes" setting. Users can specify override
     * values in alfresco-global.properties.
     */
    boolean TIKA_PARSER_PARSE_SHAPES_DEFAULT_VALUE = false;

    /**
     * Adds the appropriate configuration values to the given metadata map, which Tika
     * then uses to control some of its features.
     *
     * @param metadata
     *            input/output map used by Tika to pass information about the file
     */
    void prepareMetadataWithConfigParams(Metadata metadata);
}

View File

@@ -0,0 +1,78 @@
/*
* Copyright (C) 2005-2016 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.metadata;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
/**
 * Default implementation of {@link MetadataExtracterConfig}. Reads its settings from
 * the injected {@link Properties} and copies them into the Tika metadata map.
 *
 * @author Andrei Rebegea
 */
public class MetadataExtracterConfigImpl implements MetadataExtracterConfig
{
    protected static Log logger = LogFactory.getLog(MetadataExtracterConfigImpl.class);

    /** Name of the property that controls whether shapes are parsed. */
    private static final String PARSE_SHAPE_PROP_STRING = "content.metadataExtracter.parseShapes";

    private Properties properties;

    /**
     * Writes the configured "parse shapes" flag into the supplied metadata map.
     * Does nothing when <code>metadata</code> is <code>null</code>.
     *
     * @param metadata
     *            input/output map used by Tika to pass information about the file
     */
    @Override
    public void prepareMetadataWithConfigParams(Metadata metadata)
    {
        if (metadata == null)
        {
            return;
        }

        boolean shouldParseShapes = getBooleanProperty(PARSE_SHAPE_PROP_STRING, TIKA_PARSER_PARSE_SHAPES_DEFAULT_VALUE);
        // set() rather than add(): the key carries a single flag, so a repeated call (or a
        // value already present in the map) must be replaced instead of accumulating values.
        metadata.set(TikaMetadataKeys.TIKA_PARSER_PARSE_SHAPES_KEY, Boolean.toString(shouldParseShapes));

        if (logger.isDebugEnabled())
        {
            logger.debug("Tika metadata options passed to tika parser: " + metadata);
        }
    }

    /**
     * The Alfresco global properties.
     *
     * @param properties the properties from which configuration values are read
     */
    public void setProperties(Properties properties)
    {
        this.properties = properties;
    }

    /**
     * Looks up a boolean property.
     *
     * @param name the property name
     * @param defaultValue the value returned when no properties were injected or the
     *        property is absent
     * @return <code>true</code> only if the trimmed property value equals "true"
     *         (ignoring case); <code>defaultValue</code> when the property is missing
     */
    private boolean getBooleanProperty(String name, boolean defaultValue)
    {
        if (properties == null)
        {
            return defaultValue;
        }
        String property = properties.getProperty(name);
        if (property == null)
        {
            return defaultValue;
        }
        return Boolean.parseBoolean(property.trim());
    }
}

View File

@@ -379,6 +379,10 @@ public abstract class TikaPoweredMetadataExtracter
Metadata metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, reader.getMimetype());
if (metadataExtracterConfig != null)
{
metadataExtracterConfig.prepareMetadataWithConfigParams(metadata);
}
ParseContext context = buildParseContext(metadata, reader.getMimetype());

View File

@@ -40,6 +40,7 @@ import org.alfresco.repo.content.AbstractStreamAwareProxy;
import org.alfresco.repo.content.StreamAwareContentReaderProxy;
import org.alfresco.repo.content.StreamAwareContentWriterProxy;
import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter;
import org.alfresco.repo.content.metadata.MetadataExtracterConfig;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentServiceTransientException;
@@ -69,7 +70,7 @@ public abstract class AbstractContentTransformer2 extends AbstractContentTransfo
private ContentTransformerRegistry registry;
private boolean registerTransformer;
private boolean retryTransformOnDifferentMimeType;
MetadataExtracterConfig metadataExtracterConfig;
/**
* A flag that indicates that the transformer should be started in its own Thread so
* that it may be interrupted rather than using the timeout in the Reader.
@@ -112,6 +113,14 @@ public abstract class AbstractContentTransformer2 extends AbstractContentTransfo
this.registry = registry;
}
/**
* Sets the metadata extracter config held by this transformer.
*
* @param metadataExtracterConfig the config instance to store in the
*        <code>metadataExtracterConfig</code> field
*/
public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
{
this.metadataExtracterConfig = metadataExtracterConfig;
}
/**
* @param registerTransformer as been available for selection.
* If {@code false} this indicates that the transformer may only be

View File

@@ -227,6 +227,10 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans
Parser parser = getParser();
Metadata metadata = new Metadata();
if (metadataExtracterConfig != null)
{
metadataExtracterConfig.prepareMetadataWithConfigParams(metadata);
}
ParseContext context = buildParseContext(metadata, targetMimeType, options);

View File

@@ -45,6 +45,7 @@ import javax.xml.transform.stream.StreamResult;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.action.ParameterDefinitionImpl;
import org.alfresco.repo.content.metadata.MetadataExtracterConfig;
import org.alfresco.repo.rendition.RenditionLocation;
import org.alfresco.service.cmr.action.ParameterDefinition;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
@@ -88,7 +89,7 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
{
private static Log logger = LogFactory.getLog(HTMLRenderingEngine.class);
private TikaConfig tikaConfig;
private MetadataExtracterConfig metadataExtracterConfig;
/**
* This optional parameter, when set to true, causes only the
* contents of the HTML body to be written out as the rendition.
@@ -129,6 +130,14 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
this.tikaConfig = tikaConfig;
}
/**
* Sets the metadata extracter config held by this rendering engine.
*
* @param metadataExtracterConfig the config instance to store in the
*        <code>metadataExtracterConfig</code> field
*/
public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
{
this.metadataExtracterConfig = metadataExtracterConfig;
}
/*
* (non-Javadoc)
* @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
@@ -350,6 +359,10 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
ContentModel.PROP_NAME
).toString()
);
if (metadataExtracterConfig != null)
{
metadataExtracterConfig.prepareMetadataWithConfigParams(metadata);
}
// Our parse context needs to extract images
ParseContext parseContext = new ParseContext();

View File

@@ -78,6 +78,8 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
extracter = new TikaAutoMetadataExtracter(config);
extracter.setDictionaryService(dictionaryService);
MetadataExtracterConfig metadataExtracterConfig = (MetadataExtracterConfig)ctx.getBean("metadataExtracterConfig");
extracter.setMetadataExtracterConfig(metadataExtracterConfig);
extracter.register();
// Attach some extra mappings, using the Tika
@@ -172,6 +174,61 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
}
}
/**
 * Test MNT-15219 Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
 * cause OutOfMemory in Tika.
 * <p>
 * Note - doesn't use extractFromMimetype
 */
public void testParsingOfShapesInXLSXFiles() throws Exception
{
    AutoDetectParser ap = new AutoDetectParser();

    String filename = "dmsu1332-reproduced.xlsx";
    URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
    // Fail with a clear message rather than a NullPointerException if the resource is missing
    assertTrue("Test file quick/" + filename + " was not found on the classpath", url != null);
    File file = new File(url.getFile());

    // Cheat and ask Tika for the mime type!
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    MediaType mt;
    TikaInputStream detectionStream = TikaInputStream.get(file);
    try
    {
        mt = ap.getDetector().detect(detectionStream, metadata);
    }
    finally
    {
        // The stream is only needed for detection; close it so the file handle is released
        detectionStream.close();
    }
    String mimetype = mt.toString();

    if (logger.isDebugEnabled())
    {
        logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
    }

    // Have it processed
    // Note that if the patched/fix from MNT-15219 is not applied,
    // or if the default false value of the content.metadataExtracter.parseShapes property is overridden
    // then the next call will throw an OutOfMemory that is dealt with by the tika metadata extracter framework
    // and it will fail at the next assert because properties extracted will be empty
    Map<QName, Serializable> properties = extractFromFile(file, mimetype);

    // check we got something
    assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype + " - " + filename,
            properties.isEmpty());

    if (properties.containsKey(ContentModel.PROP_AUTHOR))
    {
        assertEquals("Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
                "Udintsev, Anton (external - Project)",
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
    }
    else
    {
        fail("Expected one property out of " + ContentModel.PROP_CREATOR + " and " + ContentModel.PROP_AUTHOR + " but found neither of them for "
                + mimetype);
    }

    // Ensure that we can also get things which are standard
    // Tika metadata properties, if we so choose to
    assertTrue("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
            properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY));
    assertEquals("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
            mimetype,
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
}
/**
 * Always returns <code>true</code>, regardless of the mimetype passed in.
 */
@Override
protected boolean skipAuthorCheck(String mimetype)
{
    return true;
}