mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-10-15 15:02:20 +00:00
Merged 5.2.N (5.2.1) to HEAD (5.2)
126004 aleahu: Merged 5.1.N (5.1.2) to 5.2.N (5.2.1) 125892 adragoi: Merged 5.0.N (5.0.4) to 5.1.N (5.1.2) 125842 rmunteanu: Merged V4.2-BUG-FIX (4.2.7) to 5.0.N (5.0.4) (PARTIAL MERGE) 125700 adavis: Merged V4.2.5 (4.2.5.7) to V4.2-BUG-FIX (4.2.7) 125698: Merged DEV to V4.2.5 (4.2.5.7) 125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika - Should not have updated version.properties as the original commit needs to be merged forwards., 125696: Merged DEV to V4.2.5 (4.2.5.7) 125677 arebegea: MNT-15219 : Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may cause OutOfMemory in Tika - Modified tika parser and tika core jars to allow some configuration parameters to be sent from Alfresco side using the metadata map parameter - Excluded by default the parsing of drawings/shapes xmls because there was little valuable data that could be extracted from those xmls git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@127835 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -241,6 +241,13 @@
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- Metadata Extraction Configuration:
     hands the global properties to MetadataExtracterConfigImpl, which reads
     content.metadataExtracter.parseShapes and passes the setting on to Tika
     through the metadata map. -->
<bean id="metadataExtracterConfig" class="org.alfresco.repo.content.metadata.MetadataExtracterConfigImpl">
    <property name="properties" ref="global-properties" />
</bean>
|
||||
|
||||
<!-- Metadata Extraction Registry -->
|
||||
<bean id="metadataExtracterRegistry" class="org.alfresco.repo.content.metadata.MetadataExtracterRegistry" />
|
||||
|
||||
@@ -261,6 +268,9 @@
|
||||
<property name="properties">
|
||||
<ref bean="global-properties" />
|
||||
</property>
|
||||
<property name="metadataExtracterConfig">
|
||||
<ref bean="metadataExtracterConfig" />
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- For coordination between OOoDirect and OOoJodconverter subsystems -->
|
||||
@@ -466,6 +476,9 @@
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property name="retryTransformOnDifferentMimeType" value="${content.transformer.retryOn.different.mimetype}"/>
|
||||
<property name="metadataExtracterConfig">
|
||||
<ref bean="metadataExtracterConfig" />
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- Abstract bean definition defining base definition for all transformer that are not registered.
|
||||
|
@@ -175,6 +175,9 @@
|
||||
<property name="tikaConfig">
|
||||
<ref bean="tikaConfig"/>
|
||||
</property>
|
||||
<property name="metadataExtracterConfig">
|
||||
<ref bean="metadataExtracterConfig" />
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<bean id="compositeRenderingEngine"
|
||||
|
@@ -650,6 +650,9 @@ content.transformer.Poi.poiExtractPropertiesOnly=true
|
||||
# The default timeout for metadata mapping extracters
|
||||
content.metadataExtracter.default.timeoutMs=20000
|
||||
|
||||
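# Indicates whether shape/drawing XML inside Office Open XML files (e.g. .xlsx) is parsed by Tika.
# Off by default: parsing shapes containing multi-byte characters may cause OutOfMemory (MNT-15219).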
|
||||
content.metadataExtracter.parseShapes=false
|
||||
|
||||
# Property to enable upgrade from 2.1-A
|
||||
V2.1-A.fixes.to.schema=0
|
||||
#V2.1-A.fixes.to.schema=82
|
||||
|
BIN
config/quick/dmsu1332-reproduced.xlsx
Normal file
BIN
config/quick/dmsu1332-reproduced.xlsx
Normal file
Binary file not shown.
@@ -148,6 +148,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
||||
private Properties properties;
|
||||
private Map<String, MetadataExtracterLimits> mimetypeLimits;
|
||||
private ExecutorService executorService;
|
||||
protected MetadataExtracterConfig metadataExtracterConfig;
|
||||
|
||||
/**
|
||||
* Default constructor. If this is called, then {@link #isSupported(String)} should
|
||||
@@ -418,6 +419,14 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
||||
this.properties = properties;
|
||||
}
|
||||
|
||||
/**
 * Sets the metadata extracter config.
 * <p>
 * The reference is stored as given in the protected {@code metadataExtracterConfig} field;
 * no null check is performed here.
 *
 * @param metadataExtracterConfig the {@link MetadataExtracterConfig} to store
 */
|
||||
public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
|
||||
{
|
||||
this.metadataExtracterConfig = metadataExtracterConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not to enable the pass through of simple strings to cm:taggable tags
|
||||
*
|
||||
|
@@ -78,6 +78,7 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter, Be
|
||||
private long extractionTime;
|
||||
private String beanName;
|
||||
private Properties properties;
|
||||
private MetadataExtracterConfig metadataExtracterConfig;
|
||||
|
||||
protected AbstractMetadataExtracter(String supportedMimetype, double reliability, long extractionTime)
|
||||
{
|
||||
@@ -132,6 +133,14 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter, Be
|
||||
this.properties = properties;
|
||||
}
|
||||
|
||||
/**
 * Sets the metadata extracter config.
 * <p>
 * The reference is stored as given in the private {@code metadataExtracterConfig} field;
 * no null check is performed here.
 *
 * @param metadataExtracterConfig the {@link MetadataExtracterConfig} to store
 */
|
||||
public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
|
||||
{
|
||||
this.metadataExtracterConfig = metadataExtracterConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the mimetype helper
|
||||
*/
|
||||
|
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2016 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
|
||||
/**
|
||||
* Interface for allowing Alfresco to provide configuration properties to metadata extracters like
|
||||
* Tika parser;
|
||||
*
|
||||
* @author Andrei Rebegea
|
||||
*/
|
||||
public interface MetadataExtracterConfig
|
||||
{
|
||||
|
||||
// Users can specify override values in alfreso-global.properties
|
||||
public static final boolean TIKA_PARSER_PARSE_SHAPES_DEFAULT_VALUE = false;
|
||||
|
||||
/**
|
||||
* This method adds appropriate configuration values in the metadata map that will be used in
|
||||
* Tika to control some of the features Tika has;
|
||||
*
|
||||
* @param metadata
|
||||
* input/output map used by Tika to pass information about the file
|
||||
*/
|
||||
void prepareMetadataWithConfigParams(Metadata metadata);
|
||||
}
|
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2016 Alfresco Software Limited.
|
||||
*
|
||||
* This file is part of Alfresco
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaMetadataKeys;
|
||||
|
||||
/**
|
||||
* Default implementation for the MetadataExtracterConfig;
|
||||
*
|
||||
* @author Andrei Rebegea
|
||||
*/
|
||||
public class MetadataExtracterConfigImpl implements MetadataExtracterConfig
|
||||
{
|
||||
protected static Log logger = LogFactory.getLog(MetadataExtracterConfigImpl.class);
|
||||
|
||||
private static final String PARSE_SHAPE_PROP_STRING = "content.metadataExtracter.parseShapes";
|
||||
|
||||
private Properties properties;
|
||||
|
||||
@Override
|
||||
public void prepareMetadataWithConfigParams(Metadata metadata)
|
||||
{
|
||||
if (metadata == null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
boolean shouldParseShapes = getBooleanProperty(PARSE_SHAPE_PROP_STRING, TIKA_PARSER_PARSE_SHAPES_DEFAULT_VALUE);
|
||||
metadata.add(TikaMetadataKeys.TIKA_PARSER_PARSE_SHAPES_KEY, Boolean.toString(shouldParseShapes));
|
||||
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Tika metadata options passed to tika parser: " + metadata);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The Alfresco global properties.
|
||||
*/
|
||||
public void setProperties(Properties properties)
|
||||
{
|
||||
this.properties = properties;
|
||||
}
|
||||
|
||||
private boolean getBooleanProperty(String name, boolean defaultValue)
|
||||
{
|
||||
boolean value = defaultValue;
|
||||
if (properties != null)
|
||||
{
|
||||
String property = properties.getProperty(name);
|
||||
if (property != null)
|
||||
{
|
||||
value = property.trim().equalsIgnoreCase(Boolean.TRUE.toString());
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
}
|
@@ -379,6 +379,10 @@ public abstract class TikaPoweredMetadataExtracter
|
||||
|
||||
Metadata metadata = new Metadata();
|
||||
metadata.add(Metadata.CONTENT_TYPE, reader.getMimetype());
|
||||
if (metadataExtracterConfig != null)
|
||||
{
|
||||
metadataExtracterConfig.prepareMetadataWithConfigParams(metadata);
|
||||
}
|
||||
|
||||
ParseContext context = buildParseContext(metadata, reader.getMimetype());
|
||||
|
||||
|
@@ -40,6 +40,7 @@ import org.alfresco.repo.content.AbstractStreamAwareProxy;
|
||||
import org.alfresco.repo.content.StreamAwareContentReaderProxy;
|
||||
import org.alfresco.repo.content.StreamAwareContentWriterProxy;
|
||||
import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter;
|
||||
import org.alfresco.repo.content.metadata.MetadataExtracterConfig;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentServiceTransientException;
|
||||
@@ -69,7 +70,7 @@ public abstract class AbstractContentTransformer2 extends AbstractContentTransfo
|
||||
private ContentTransformerRegistry registry;
|
||||
private boolean registerTransformer;
|
||||
private boolean retryTransformOnDifferentMimeType;
|
||||
|
||||
MetadataExtracterConfig metadataExtracterConfig;
|
||||
/**
|
||||
* A flag that indicates that the transformer should be started in its own Thread so
|
||||
* that it may be interrupted rather than using the timeout in the Reader.
|
||||
@@ -112,6 +113,14 @@ public abstract class AbstractContentTransformer2 extends AbstractContentTransfo
|
||||
this.registry = registry;
|
||||
}
|
||||
|
||||
/**
 * Sets the metadata extracter config.
 * <p>
 * The reference is stored as given in the {@code metadataExtracterConfig} field; no null
 * check is performed here.
 * NOTE(review): TikaPoweredContentTransformer appears to read this field directly and only
 * applies the config when it is non-null - confirm before making it mandatory.
 *
 * @param metadataExtracterConfig the {@link MetadataExtracterConfig} to store
 */
|
||||
public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
|
||||
{
|
||||
this.metadataExtracterConfig = metadataExtracterConfig;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param registerTransformer as been available for selection.
|
||||
* If {@code false} this indicates that the transformer may only be
|
||||
|
@@ -227,6 +227,10 @@ public abstract class TikaPoweredContentTransformer extends AbstractContentTrans
|
||||
|
||||
Parser parser = getParser();
|
||||
Metadata metadata = new Metadata();
|
||||
if (metadataExtracterConfig != null)
|
||||
{
|
||||
metadataExtracterConfig.prepareMetadataWithConfigParams(metadata);
|
||||
}
|
||||
|
||||
ParseContext context = buildParseContext(metadata, targetMimeType, options);
|
||||
|
||||
|
@@ -45,6 +45,7 @@ import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.action.ParameterDefinitionImpl;
|
||||
import org.alfresco.repo.content.metadata.MetadataExtracterConfig;
|
||||
import org.alfresco.repo.rendition.RenditionLocation;
|
||||
import org.alfresco.service.cmr.action.ParameterDefinition;
|
||||
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
|
||||
@@ -88,7 +89,7 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
||||
{
|
||||
private static Log logger = LogFactory.getLog(HTMLRenderingEngine.class);
|
||||
private TikaConfig tikaConfig;
|
||||
|
||||
private MetadataExtracterConfig metadataExtracterConfig;
|
||||
/**
|
||||
* This optional parameter, when set to true, causes only the
|
||||
* contents of the HTML body to be written out as the rendition.
|
||||
@@ -129,6 +130,14 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
||||
this.tikaConfig = tikaConfig;
|
||||
}
|
||||
|
||||
/**
 * Sets the metadata extracter config.
 * <p>
 * The reference is stored as given; no null check is performed here. When it is left unset,
 * the null guard in this class skips the call to
 * {@link MetadataExtracterConfig#prepareMetadataWithConfigParams}.
 *
 * @param metadataExtracterConfig the {@link MetadataExtracterConfig} to store
 */
|
||||
public void setMetadataExtracterConfig(MetadataExtracterConfig metadataExtracterConfig)
|
||||
{
|
||||
this.metadataExtracterConfig = metadataExtracterConfig;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
|
||||
@@ -350,6 +359,10 @@ public class HTMLRenderingEngine extends AbstractRenderingEngine
|
||||
ContentModel.PROP_NAME
|
||||
).toString()
|
||||
);
|
||||
if (metadataExtracterConfig != null)
|
||||
{
|
||||
metadataExtracterConfig.prepareMetadataWithConfigParams(metadata);
|
||||
}
|
||||
|
||||
// Our parse context needs to extract images
|
||||
ParseContext parseContext = new ParseContext();
|
||||
|
@@ -78,6 +78,8 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
|
||||
extracter = new TikaAutoMetadataExtracter(config);
|
||||
extracter.setDictionaryService(dictionaryService);
|
||||
MetadataExtracterConfig metadataExtracterConfig = (MetadataExtracterConfig)ctx.getBean("metadataExtracterConfig");
|
||||
extracter.setMetadataExtracterConfig(metadataExtracterConfig);
|
||||
extracter.register();
|
||||
|
||||
// Attach some extra mappings, using the Tika
|
||||
@@ -172,6 +174,61 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test MNT-15219 Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
|
||||
* cause OutOfMemory in Tika Note - doesn't use extractFromMimetype
|
||||
*/
|
||||
public void testParsingOfShapesInXLSXFiles() throws Exception
|
||||
{
|
||||
AutoDetectParser ap = new AutoDetectParser();
|
||||
|
||||
String filename = "dmsu1332-reproduced.xlsx";
|
||||
URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
|
||||
File file = new File(url.getFile());
|
||||
|
||||
// Cheat and ask Tika for the mime type!
|
||||
Metadata metadata = new Metadata();
|
||||
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
|
||||
MediaType mt = ap.getDetector().detect(TikaInputStream.get(file), metadata);
|
||||
String mimetype = mt.toString();
|
||||
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
|
||||
}
|
||||
|
||||
// Have it processed
|
||||
// Note that if the patched/fix from MNT-15219 is not applied,
|
||||
// or if the default false value of the content.metadataExtracter.parseShapes property is overridden
|
||||
// then the next call will throw an OutOfMemory that is dealt with by the tika metadata extracter framework
|
||||
// and it will fail at the next assert because properties extracted will be empty
|
||||
Map<QName, Serializable> properties = extractFromFile(file, mimetype);
|
||||
|
||||
// check we got something
|
||||
assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype + " - " + filename,
|
||||
properties.isEmpty());
|
||||
|
||||
if (properties.containsKey(ContentModel.PROP_AUTHOR))
|
||||
{
|
||||
assertEquals("Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
|
||||
"Udintsev, Anton (external - Project)",
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
|
||||
}
|
||||
else
|
||||
{
|
||||
fail("Expected one property out of " + ContentModel.PROP_CREATOR + " and " + ContentModel.PROP_AUTHOR + " but found neither of them for "
|
||||
+ mimetype);
|
||||
}
|
||||
|
||||
// Ensure that we can also get things which are standard
|
||||
// Tika metadata properties, if we so choose to
|
||||
assertTrue("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
|
||||
properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY));
|
||||
assertEquals("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
|
||||
mimetype,
|
||||
DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean skipAuthorCheck(String mimetype) { return true; }
|
||||
|
||||
|
Reference in New Issue
Block a user