XML metadata extraction with sample.

Added tests into build.

This is now ready for testing, comments and suggestions.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@6056 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Derek Hulley
2007-06-21 16:09:03 +00:00
parent 757616bc85
commit 55a6e2f287
10 changed files with 789 additions and 53 deletions

View File

@@ -93,7 +93,6 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private MetadataExtracterRegistry registry;
private MimetypeService mimetypeService;
private long extractionTime;
private boolean initialized;
private Set<String> supportedMimetypes;
@@ -101,12 +100,23 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
private Map<String, Set<QName>> mapping;
private boolean inheritDefaultMapping;
/**
* Default constructor, which delegates to the set-based constructor with an empty set of
* supported mimetypes.
* <p>
* If this is called, then {@link #isSupported(String)} should be implemented. This is
* useful when the list of supported mimetypes is not known when the instance is
* constructed. Alternatively, once the set becomes known, call
* {@link #setSupportedMimetypes(Collection)}.
*
* @see #isSupported(String)
* @see #setSupportedMimetypes(Collection)
*/
protected AbstractMappingMetadataExtracter()
{
this(Collections.<String>emptySet());
}
/**
* Constructor that can be used when the list of supported mimetypes is known up front.
*
* @param supportedMimetypes the set of mimetypes supported by default
*/
protected AbstractMappingMetadataExtracter(Set<String> supportedMimetypes)
@@ -179,13 +189,29 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
}
/**
* @param overwritePolicy the policy to apply when there are existing system properties
* Set the policy to use when existing values are encountered. Depending on how the extracter
* is called, this may not be relevant, i.e. an empty map of existing properties may be passed
* in by the client code, which may follow its own overwrite strategy.
*
* @param overwritePolicy the policy to apply when there are existing system properties
*/
public void setOverwritePolicy(OverwritePolicy overwritePolicy)
{
this.overwritePolicy = overwritePolicy;
}
/**
* Set the policy to use when existing values are encountered, giving the policy by name.
* The string is converted using <tt>OverwritePolicy.valueOf</tt>.
* NOTE(review): presumably the value must be the exact name of an <tt>OverwritePolicy</tt>
* constant and anything else is rejected - confirm against the <tt>OverwritePolicy</tt> type.
* <p>
* Depending on how the extracter is called, this may not be relevant, i.e. an empty map of
* existing properties may be passed in by the client code, which may follow its own
* overwrite strategy.
*
* @param overwritePolicyStr the name of the policy to apply when there are existing system properties
*/
public void setOverwritePolicy(String overwritePolicyStr)
{
this.overwritePolicy = OverwritePolicy.valueOf(overwritePolicyStr);
}
/**
* Set if the property mappings augment or override the mapping generically provided by the
* extracter implementation. The default is <tt>false</tt>, i.e. any mapping set completely
@@ -410,10 +436,6 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
{
registry.register(this);
}
else
{
logger.warn("No registry provided. Not registering: " + this);
}
}
/**
@@ -466,7 +488,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
/** {@inheritDoc} */
public long getExtractionTime()
{
return extractionTime;
return 1000L;
}
/**
@@ -510,7 +532,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
/**
* {@inheritDoc}
*/
public final Map<QName, Serializable> extract(
public Map<QName, Serializable> extract(
ContentReader reader,
OverwritePolicy overwritePolicy,
Map<QName, Serializable> destination,

View File

@@ -222,6 +222,8 @@ public interface MetadataExtracter extends ContentWorker
* reliant transformers will be used for a specific extraction.
*
* @return Returns the approximate number of milliseconds per transformation
*
* @deprecated Generally not useful or used. Extraction is normally specifically configured.
*/
public long getExtractionTime();

View File

@@ -145,7 +145,6 @@ public class MetadataExtracterRegistry
*/
private MetadataExtracter findBestExtracter(String sourceMimetype)
{
long bestTime = Long.MAX_VALUE;
logger.debug("Finding best extracter for " + sourceMimetype);
MetadataExtracter bestExtracter = null;
@@ -157,12 +156,7 @@ public class MetadataExtracterRegistry
// extraction not achievable
continue;
}
long time = ext.getExtractionTime();
if (time < bestTime)
{
bestExtracter = ext;
bestTime = time;
}
bestExtracter = ext;
}
return bestExtracter;
}

View File

@@ -0,0 +1,191 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata.xml;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.alfresco.repo.content.selector.ContentWorkerSelector;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter;
import org.alfresco.repo.content.metadata.MetadataExtracter;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.alfresco.util.PropertyCheck;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * A metadata extracter that selects an appropriate worker for the extraction.
 * <p>
 * The {@linkplain #setSelectors(List) selectors} are used to find the extracter most
 * appropriate for a given XML document.  The chosen extracter is then asked to extract
 * the values, and is handed the {@linkplain MetadataExtracter.OverwritePolicy overwrite policy},
 * destination properties and mapping that this instance was called with.  The overwrite policy
 * of the embedded extracters is not relevant unless they are used separately in another context.
 *
 * @see ContentWorkerSelector
 * @see MetadataExtracter
 *
 * @since 2.1
 * @author Derek Hulley
 */
public class XmlMetadataExtracter extends AbstractMappingMetadataExtracter
{
    /** The mimetypes passed to the base class as supported: XML only. */
    public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_XML};

    // Log under this class's own category (previously XPathMetadataExtracter's)
    private static Log logger = LogFactory.getLog(XmlMetadataExtracter.class);

    /** The selectors that are consulted, in list order, to find an extracter for a document. */
    private List<ContentWorkerSelector<MetadataExtracter>> selectors;

    /**
     * Default constructor, registering {@link #SUPPORTED_MIMETYPES} as the supported mimetypes.
     */
    public XmlMetadataExtracter()
    {
        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
    }

    /**
     * Sets the list of metadata selectors to use to find the extracter to use, given
     * some content.  The evaluations are done in the order that they occur in the
     * list.
     *
     * @param selectors     A list of selectors
     */
    public void setSelectors(List<ContentWorkerSelector<MetadataExtracter>> selectors)
    {
        this.selectors = selectors;
    }

    /**
     * Checks that the mandatory <tt>selectors</tt> property has been set and then lets
     * the base class perform its own initialization.
     */
    @Override
    protected void init()
    {
        PropertyCheck.mandatory(this, "selectors", selectors);
        // Get the base class to set up its mappings
        super.init();
    }

    /**
     * It is not possible to have any default mappings, but something has to be returned.
     *
     * @return          Returns an empty map
     */
    @Override
    protected Map<String, Set<QName>> getDefaultMapping()
    {
        return Collections.emptyMap();
    }

    /**
     * Selects an extracter to perform the work and redirects to it.
     * <p>
     * Each selector is consulted in turn and the first one that provides an extracter wins.
     * If no selector provides an extracter, nothing is extracted.
     *
     * @param reader            the source of the XML content
     * @param overwritePolicy   the policy that is passed on to the selected extracter
     * @param destination       the properties that are passed on to the selected extracter
     * @param mapping           the mapping that is passed on to the selected extracter
     * @return                  Returns whatever the selected extracter returned, or an empty
     *                          map if no extracter could be selected
     */
    @Override
    public Map<QName, Serializable> extract(
            ContentReader reader,
            OverwritePolicy overwritePolicy,
            Map<QName, Serializable> destination,
            Map<String, Set<QName>> mapping)
    {
        MetadataExtracter extracter = null;
        // Select a worker
        for (ContentWorkerSelector<MetadataExtracter> selector : selectors)
        {
            // Each selector works on a freshly spawned reader so that the original reader
            // is left untouched for the extracter that is eventually chosen
            ContentReader spawnedReader = reader.getReader();
            try
            {
                extracter = selector.getWorker(spawnedReader);
            }
            finally
            {
                // It is the spawned reader that the selector was given, so that is the
                // one that has to be checked for a channel that was left open
                if (spawnedReader.isChannelOpen())
                {
                    logger.error("Content reader not closed by MetadataExtractor selector: \n" +
                            "   reader:   " + spawnedReader + "\n" +
                            "   selector: " + selector);
                }
            }
            // Just take the first successful one
            if (extracter != null)
            {
                if (logger.isDebugEnabled())
                {
                    logger.debug("\n" +
                            "Found metadata extracter to process XML document: \n" +
                            "   Selector: " + selector + "\n" +
                            "   Document: " + reader);
                }
                break;
            }
        }

        Map<QName, Serializable> modifiedProperties = null;
        // Did we find anything?
        if (extracter == null)
        {
            // There will be no properties extracted
            modifiedProperties = Collections.emptyMap();
        }
        else
        {
            // An extracter was selected
            try
            {
                modifiedProperties = extracter.extract(reader, overwritePolicy, destination, mapping);
            }
            finally
            {
                if (reader.isChannelOpen())
                {
                    logger.error("Content reader not closed by MetadataExtractor: \n" +
                            "   Reader:   " + reader + "\n" +
                            "   extracter: " + extracter);
                }
            }
        }
        // Done
        if (logger.isDebugEnabled())
        {
            logger.debug("\n" +
                    "XML metadata extractor redirected: \n" +
                    "   Reader:    " + reader + "\n" +
                    "   Extracter: " + extracter + "\n" +
                    "   Extracted: " + modifiedProperties);
        }
        return modifiedProperties;
    }

    /**
     * This is not required as the
     * {@link #extract(ContentReader, OverwritePolicy, Map, Map) extract} method is overridden
     * to redirect all the work to the selected extracter.
     *
     * @throws UnsupportedOperationException    always
     */
    protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
    {
        throw new UnsupportedOperationException();
    }
}

View File

@@ -31,10 +31,27 @@ import java.net.URL;
import junit.framework.TestCase;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.action.ActionImpl;
import org.alfresco.repo.action.executer.ActionExecuter;
import org.alfresco.repo.action.executer.SetPropertyValueActionExecuter;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.metadata.MetadataExtracter;
import org.alfresco.repo.content.selector.RootElementNameContentWorkerSelector;
import org.alfresco.repo.content.selector.XPathContentWorkerSelector;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.repo.security.authentication.AuthenticationComponent;
import org.alfresco.service.ServiceRegistry;
import org.alfresco.service.cmr.action.Action;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.NodeService;
import org.alfresco.service.cmr.repository.StoreRef;
import org.alfresco.service.namespace.NamespaceService;
import org.alfresco.service.namespace.QName;
import org.alfresco.util.GUID;
import org.alfresco.util.PropertyMap;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
@@ -54,8 +71,13 @@ public class XmlMetadataExtracterTest extends TestCase
private static final String CTX_LOCATION = "classpath:xml-metadata/xml-metadata-test-context.xml";
private static final ApplicationContext ctx = new ClassPathXmlApplicationContext(CTX_LOCATION);
private XPathMetadataExtracter alfrescoModelMetadataExtractor;
private XPathMetadataExtracter eclipseProjectMetadataExtractor;
private ServiceRegistry serviceRegistry;
private AuthenticationComponent authenticationComponent;
private XPathMetadataExtracter alfrescoModelMetadataExtracter;
private XPathMetadataExtracter eclipseProjectMetadataExtracter;
private RootElementNameContentWorkerSelector<MetadataExtracter> rootElementNameMetadataExtracterSelector;
private XPathContentWorkerSelector<MetadataExtracter> xpathMetadataExtracterSelector;
private XmlMetadataExtracter xmlMetadataExtracter;
/**
* Get a reader for a file that should be on the classpath.
@@ -78,16 +100,30 @@ public class XmlMetadataExtracterTest extends TestCase
}
@Override
@SuppressWarnings("unchecked")
public void setUp() throws Exception
{
alfrescoModelMetadataExtractor = (XPathMetadataExtracter) ctx.getBean("extracter.xml.AlfrescoModelMetadataExtracter");
eclipseProjectMetadataExtractor = (XPathMetadataExtracter) ctx.getBean("extracter.xml.EclipseProjectMetadataExtracter");
serviceRegistry = (ServiceRegistry) ctx.getBean(ServiceRegistry.SERVICE_REGISTRY);
authenticationComponent = (AuthenticationComponent) ctx.getBean("authenticationComponent");
alfrescoModelMetadataExtracter = (XPathMetadataExtracter) ctx.getBean("extracter.xml.AlfrescoModelMetadataExtracter");
eclipseProjectMetadataExtracter = (XPathMetadataExtracter) ctx.getBean("extracter.xml.EclipseProjectMetadataExtracter");
rootElementNameMetadataExtracterSelector = (RootElementNameContentWorkerSelector<MetadataExtracter>) ctx.getBean("extracter.xml.selector.RootElementSelector");
xpathMetadataExtracterSelector = (XPathContentWorkerSelector<MetadataExtracter>) ctx.getBean("extracter.xml.selector.XPathSelector");
xmlMetadataExtracter = (XmlMetadataExtracter) ctx.getBean("extracter.xml.XMLMetadataExtracter");
authenticationComponent.setSystemUserAsCurrentUser();
}
@Override
public void tearDown() throws Exception
{
    try
    {
        authenticationComponent.clearCurrentSecurityContext();
    }
    catch (Throwable ignored)
    {
        // Best effort only: a failure to clear the security context is deliberately
        // not allowed to turn into a test failure
    }
}
public void testSetUp()
{
assertNotNull(alfrescoModelMetadataExtractor);
assertNotNull(eclipseProjectMetadataExtractor);
assertNotNull(alfrescoModelMetadataExtracter);
assertNotNull(eclipseProjectMetadataExtracter);
}
public void testExtractAlfresocModel() throws Exception
@@ -98,7 +134,7 @@ public class XmlMetadataExtracterTest extends TestCase
// Pass it to the extracter
PropertyMap checkProperties = new PropertyMap();
alfrescoModelMetadataExtractor.extract(reader, checkProperties);
alfrescoModelMetadataExtracter.extract(reader, checkProperties);
// Check the values
assertEquals("Gavin Cornwell", checkProperties.get(ContentModel.PROP_AUTHOR));
@@ -114,10 +150,128 @@ public class XmlMetadataExtracterTest extends TestCase
// Pass it to the extracter
PropertyMap checkProperties = new PropertyMap();
eclipseProjectMetadataExtractor.extract(reader, checkProperties);
eclipseProjectMetadataExtracter.extract(reader, checkProperties);
// Check the values
assertEquals("Repository", checkProperties.get(ContentModel.PROP_TITLE));
assertEquals("JavaCC Nature", checkProperties.get(ContentModel.PROP_DESCRIPTION));
}
public void testRootElementNameSelector() throws Exception
{
    // Both sample documents have to be available before anything is checked
    ContentReader modelReader = getReader(FILE_ALFRESCO_MODEL);
    assertTrue(modelReader.exists());
    ContentReader projectReader = getReader(FILE_ECLIPSE_PROJECT);
    assertTrue(projectReader.exists());

    // The Alfresco model document must select the Alfresco model extracter
    // and the selector must not leave the read channel open
    MetadataExtracter selectedForModel =
            rootElementNameMetadataExtracterSelector.getWorker(modelReader);
    assertNotNull("Failed to select correct extracter", selectedForModel);
    assertTrue(
            "Incorrect extracter instance selected",
            alfrescoModelMetadataExtracter == selectedForModel);
    assertFalse("Read channel not closed", modelReader.isChannelOpen());

    // The Eclipse project document must select the Eclipse project extracter
    // and the selector must not leave the read channel open
    MetadataExtracter selectedForProject =
            rootElementNameMetadataExtracterSelector.getWorker(projectReader);
    assertNotNull("Failed to select correct extracter", selectedForProject);
    assertTrue(
            "Incorrect extracter instance selected",
            eclipseProjectMetadataExtracter == selectedForProject);
    assertFalse("Read channel not closed", projectReader.isChannelOpen());
}
public void testXpathSelector() throws Exception
{
    // Both sample documents have to be available before anything is checked
    ContentReader modelReader = getReader(FILE_ALFRESCO_MODEL);
    assertTrue(modelReader.exists());
    ContentReader projectReader = getReader(FILE_ECLIPSE_PROJECT);
    assertTrue(projectReader.exists());

    // The Alfresco model document must select the Alfresco model extracter
    // and the selector must not leave the read channel open
    MetadataExtracter selectedForModel =
            xpathMetadataExtracterSelector.getWorker(modelReader);
    assertNotNull("Failed to select correct extracter", selectedForModel);
    assertTrue(
            "Incorrect extracter instance selected",
            alfrescoModelMetadataExtracter == selectedForModel);
    assertFalse("Read channel not closed", modelReader.isChannelOpen());

    // The Eclipse project document must select the Eclipse project extracter
    // and the selector must not leave the read channel open
    MetadataExtracter selectedForProject =
            xpathMetadataExtracterSelector.getWorker(projectReader);
    assertNotNull("Failed to select correct extracter", selectedForProject);
    assertTrue(
            "Incorrect extracter instance selected",
            eclipseProjectMetadataExtracter == selectedForProject);
    assertFalse("Read channel not closed", projectReader.isChannelOpen());
}
public void testXmlMetadataExtracter() throws Exception
{
    // Both sample documents have to be available before anything is checked
    ContentReader modelReader = getReader(FILE_ALFRESCO_MODEL);
    assertTrue(modelReader.exists());
    ContentReader projectReader = getReader(FILE_ECLIPSE_PROJECT);
    assertTrue(projectReader.exists());

    // Alfresco model document: extract into an empty property map and check the result
    PropertyMap modelProperties = new PropertyMap();
    xmlMetadataExtracter.extract(modelReader, modelProperties);
    assertEquals("Gavin Cornwell", modelProperties.get(ContentModel.PROP_AUTHOR));
    assertEquals("fm:forummodel", modelProperties.get(ContentModel.PROP_TITLE));
    assertEquals("Forum Model", modelProperties.get(ContentModel.PROP_DESCRIPTION));

    // Eclipse project document: extract into an empty property map and check the result
    PropertyMap projectProperties = new PropertyMap();
    xmlMetadataExtracter.extract(projectReader, projectProperties);
    assertEquals("Repository", projectProperties.get(ContentModel.PROP_TITLE));
    assertEquals("JavaCC Nature", projectProperties.get(ContentModel.PROP_DESCRIPTION));
}
/**
 * Runs the <tt>extract-metadata</tt> action executer against a node that holds the Alfresco
 * model sample document and checks that the title and description given to the node at
 * creation have been replaced by the extracted values.
 * NOTE(review): the replacement relies on the XML extracter being configured with an EAGER
 * overwrite policy in the test context, which is not visible here - confirm.
 */
public void testLifecycleOfXmlMetadataExtraction() throws Exception
{
    NodeService nodeService = serviceRegistry.getNodeService();
    ContentService contentService = serviceRegistry.getContentService();
    ActionExecuter metadataExecuter = (ActionExecuter) ctx.getBean("extract-metadata");
    Action metadataAction = new ActionImpl(null, GUID.generate(), SetPropertyValueActionExecuter.NAME, null);

    // Use a store named after the test, creating it the first time around
    StoreRef storeRef = new StoreRef("test", getName());
    if (!nodeService.exists(storeRef))
    {
        nodeService.createStore("test", getName());
    }
    NodeRef rootNodeRef = nodeService.getRootNode(storeRef);

    // The node starts out with values that the extraction is expected to replace
    PropertyMap initialProperties = new PropertyMap();
    initialProperties.put(ContentModel.PROP_TITLE, "My title");
    initialProperties.put(ContentModel.PROP_DESCRIPTION, "My description");
    QName assocQName = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, getName());
    NodeRef contentNodeRef = nodeService.createNode(
            rootNodeRef,
            ContentModel.ASSOC_CHILDREN,
            assocQName,
            ContentModel.TYPE_CONTENT,
            initialProperties).getChildRef();

    // Copy the sample document into the node as XML content
    ContentReader modelReader = getReader(FILE_ALFRESCO_MODEL);
    assertTrue(modelReader.exists());
    ContentWriter contentWriter = contentService.getWriter(contentNodeRef, ContentModel.PROP_CONTENT, true);
    contentWriter.setEncoding("UTF-8");
    contentWriter.setMimetype(MimetypeMap.MIMETYPE_XML);
    contentWriter.putContent(modelReader);

    // Execute the action
    metadataExecuter.execute(metadataAction, contentNodeRef);

    // The initial title and description should have been replaced by the extracted values
    String actualTitle = (String) nodeService.getProperty(contentNodeRef, ContentModel.PROP_TITLE);
    String actualDescription = (String) nodeService.getProperty(contentNodeRef, ContentModel.PROP_DESCRIPTION);
    assertEquals("fm:forummodel", actualTitle);
    assertEquals("Forum Model", actualDescription);
}
}

View File

@@ -22,8 +22,9 @@
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content;
package org.alfresco.repo.content.selector;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;

View File

@@ -22,7 +22,7 @@
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata.xml;
package org.alfresco.repo.content.selector;
import java.io.InputStream;
import java.util.Collections;
@@ -33,11 +33,11 @@ import java.util.Set;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.alfresco.repo.content.ContentWorkerSelector;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.metadata.MetadataExtracter;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.util.PropertyCheck;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.Attributes;
@@ -48,33 +48,38 @@ import org.xml.sax.helpers.DefaultHandler;
* A selector that looks at the root node of an XML document to determine which worker to provide.
* There are many ways to identify XML documents and this is probably the simplest. Alternate
* implementations might execute a series of xpath statements or look for specific namespace
* declarations in the document. The net result is the same, i.e. given an XML document, an
* extracter is provided to the caller.
* <p>
* In this selector, there is no guarantee that the different extracters will generate the same
* (or even nearly the same) metadata. It is up to the configurer to ensure that if it is a
* requirement, but otherwise each extracter is responsible for its own mappings. Mostly, though,
* a root node match will imply a structure that has the necessary metadata.
* declarations in the document. The net result is the same, i.e. given an XML document, a
* worker is provided to the caller.
*
* @since 2.1
* @author Derek Hulley
*/
public class RootElementNameMetadataExtracterSelector
public class RootElementNameContentWorkerSelector<W extends ContentWorker>
extends DefaultHandler
implements ContentWorkerSelector<MetadataExtracter>
implements ContentWorkerSelector<ContentWorker>
{
private static Log logger = LogFactory.getLog(RootElementNameMetadataExtracterSelector.class);
private static Log logger = LogFactory.getLog(RootElementNameContentWorkerSelector.class);
private SAXParserFactory saxParserFactory;
private Set<String> supportedMimetypes;
private Map<String, MetadataExtracter> extractersByRootElementName;
private Map<String, W> workersByRootElementName;
public RootElementNameMetadataExtracterSelector()
public RootElementNameContentWorkerSelector()
{
saxParserFactory = SAXParserFactory.newInstance();
supportedMimetypes = new HashSet<String>();
supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
extractersByRootElementName = Collections.emptyMap();
workersByRootElementName = Collections.emptyMap();
}
@Override
public String toString()
{
    // Class name followed by the configured root-element-to-worker map
    return "RootElementNameContentWorkerSelector" + "[ workers=" + workersByRootElementName + "]";
}
/**
@@ -89,26 +94,35 @@ public class RootElementNameMetadataExtracterSelector
}
/**
* Set the extractors to use.
* Set the workers to choose from.
*
* @param extracters a map of {@linkplain MetadataExtracter} instances
* @param workers a map of {@linkplain ContentWorker} instances
* keyed by root element name
*/
public void setExtracters(Map<String, MetadataExtracter> extracters)
public void setWorkers(Map<String, W> workers)
{
this.extractersByRootElementName = extracters;
this.workersByRootElementName = workers;
}
/**
* Performs a match of the root element name to find the correct extracter.
* Checks the configuration.
*/
public MetadataExtracter getWorker(ContentReader reader)
public void init()
{
// Both the worker map and the supported mimetypes are required properties.
// NOTE(review): presumably PropertyCheck.mandatory rejects an unset (null) value - confirm against PropertyCheck
PropertyCheck.mandatory(this, "workers", workersByRootElementName);
PropertyCheck.mandatory(this, "supportedMimetypes", supportedMimetypes);
}
/**
* Performs a match of the root element name to find the correct content worker.
*/
public W getWorker(ContentReader reader)
{
if (!supportedMimetypes.contains(reader.getMimetype()))
{
return null;
}
MetadataExtracter extracter = null;
W worker = null;
InputStream is = null;
String rootElementName = null;
try
@@ -121,11 +135,15 @@ public class RootElementNameMetadataExtracterSelector
catch (RootElementFoundException e)
{
rootElementName = e.getElementName();
extracter = extractersByRootElementName.get(rootElementName);
worker = workersByRootElementName.get(rootElementName);
}
catch (Throwable e)
{
throw new ContentIOException("Failed to extract root element from XML document", e);
throw new ContentIOException("\n" +
"Failed to extract root element from XML document: \n" +
" Reader: " + reader + "\n" +
" Selector: " + this,
e);
}
finally
{
@@ -138,18 +156,18 @@ public class RootElementNameMetadataExtracterSelector
if (logger.isDebugEnabled())
{
logger.debug("\n" +
"Chosen metadata extracter for reader: \n" +
"Chosen content worker for reader: \n" +
" Reader: " + reader + "\n" +
" Root Element: " + rootElementName + "\n" +
" Extracter: " + extracter);
" Worker: " + worker);
}
return extracter;
return worker;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
{
throw new RootElementFoundException(localName);
throw new RootElementFoundException(qName);
}
/**

View File

@@ -0,0 +1,204 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.selector;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.util.PropertyCheck;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
/**
 * A selector that executes a set of XPath statements against the XML document to determine
 * which content worker to provide.  The XPath rules are simple, i.e. if an XML node is
 * found by the XPath statement, then it is considered to be a hit and the corresponding
 * worker is returned.
 * <p>
 * Currently, the only namespaces supported are those contained in the XML documents being
 * tested.
 *
 * @param <W>   the type of content worker that this selector provides
 *
 * @since 2.1
 * @author Derek Hulley
 */
public class XPathContentWorkerSelector<W extends ContentWorker> implements ContentWorkerSelector<W>
{
    private static Log logger = LogFactory.getLog(XPathContentWorkerSelector.class);

    // NOTE(review): a single DocumentBuilder is shared by all calls to getWorker.  The JDK does
    // not document DocumentBuilder as thread-safe, so confirm that instances of this class are
    // not used concurrently.  The builder is also created with the factory defaults; confirm
    // whether DTDs and external entities should be disabled for the documents parsed here.
    private DocumentBuilder documentBuilder;
    private XPathFactory xpathFactory;
    private Set<String> supportedMimetypes;
    /** The workers to choose from, keyed by the XPath statement that selects them. */
    private Map<String, W> workersByXPath;

    /**
     * Creates the XML parser and the XPath factory, and supports the XML mimetype by default.
     */
    public XPathContentWorkerSelector()
    {
        try
        {
            documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            xpathFactory = XPathFactory.newInstance();
        }
        catch (Throwable e)
        {
            throw new AlfrescoRuntimeException("Failed to initialize XPathContentWorkerSelector", e);
        }
        supportedMimetypes = new HashSet<String>();
        supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
    }

    @Override
    public String toString()
    {
        StringBuilder sb = new StringBuilder(50);
        sb.append("XPathContentWorkerSelector")
          .append("[ workers=").append(workersByXPath)
          .append("]");
        return sb.toString();
    }

    /**
     * Optionally set the mimetypes supported.  They must be XML formats that the chosen
     * parser will be able to handle.
     *
     * @param supportedMimetypes the list of mimetypes.  The default is <b>text/xml</b>.
     */
    public void setSupportedMimetypes(Set<String> supportedMimetypes)
    {
        this.supportedMimetypes = supportedMimetypes;
    }

    /**
     * Set the workers to use.  All the XPath statements provided must be compatible with
     * a return value of type {@linkplain XPathConstants#NODE NODE}.
     *
     * @param workers           a map of {@linkplain ContentWorker} instances
     *                          keyed by XPath statements
     */
    public void setWorkers(Map<String, W> workers)
    {
        this.workersByXPath = workers;
    }

    /**
     * Checks the configuration.
     */
    public void init()
    {
        PropertyCheck.mandatory(this, "workers", workersByXPath);
        PropertyCheck.mandatory(this, "supportedMimetypes", supportedMimetypes);
    }

    /**
     * Execute the XPath statements, in the iteration order of the configured map, against the
     * document.  Any statements that fail to run will be ignored.
     *
     * @param reader    the source of the XML document
     * @return          Returns the worker of the first XPath statement that found a node, or
     *                  <tt>null</tt> if the mimetype is not supported or nothing matched
     * @throws ContentIOException   if the document could not be read or parsed
     */
    public W getWorker(ContentReader reader)
    {
        if (!supportedMimetypes.contains(reader.getMimetype()))
        {
            return null;
        }
        Map.Entry<String, W> match = null;
        InputStream is = null;
        try
        {
            is = reader.getContentInputStream();
            Document doc = documentBuilder.parse(is);
            // Execute the statements
            match = findFirstMatch(doc);
        }
        catch (Throwable e)
        {
            throw new ContentIOException("\n" +
                    "Failed to execute XPaths against XML document: \n" +
                    "   Reader:   " + reader + "\n" +
                    "   Selector: " + this,
                    e);
        }
        finally
        {
            if (is != null)
            {
                // Best effort: nothing useful can be done if the close fails
                try { is.close(); } catch (IOException ignored) {}
            }
        }
        // Keep hold of the statement that matched so that it can be reported
        String xpath = (match == null) ? null : match.getKey();
        W worker = (match == null) ? null : match.getValue();
        // Done
        if (logger.isDebugEnabled())
        {
            logger.debug("\n" +
                    "Chosen content worker for reader: \n" +
                    "   Reader:       " + reader + "\n" +
                    "   XPath:        " + xpath + "\n" +
                    "   Worker:       " + worker);
        }
        return worker;
    }

    /**
     * Check the given document against the list of XPath statements provided.
     *
     * @param doc       the XML document
     * @return          Returns the entry (XPath statement and content worker) of the first
     *                  statement that found a node, or <tt>null</tt> if none did
     */
    private Map.Entry<String, W> findFirstMatch(Document doc)
    {
        for (Map.Entry<String, W> entry : workersByXPath.entrySet())
        {
            String xpath = entry.getKey();
            try
            {
                // Execute the statement
                Object ret = xpathFactory.newXPath().evaluate(xpath, doc, XPathConstants.NODE);
                if (ret != null)
                {
                    // We found one
                    return entry;
                }
            }
            catch (XPathExpressionException e)
            {
                // We accept this and move on, but leave a trace for whoever configured the statement
                if (logger.isDebugEnabled())
                {
                    logger.debug("Ignoring XPath statement that failed to execute: " + xpath, e);
                }
            }
        }
        // Nothing found
        return null;
    }
}