XPath-based XML metadata extractor

- No tests
 - Simple root element name redirector
Some comments fleshed out on the new mapping metadata extractor


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5969 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Derek Hulley 2007-06-15 02:30:14 +00:00
parent 1b97517ce6
commit f770bb0190
7 changed files with 611 additions and 9 deletions

View File

@ -0,0 +1,38 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content;
/**
 * A marker interface for instances that operate on content. It declares no
 * methods of its own; specific <i>worker</i> interfaces such as metadata extractors,
 * content transformers and so forth extend it.
 *
 * @see ContentWorkerSelector
 * @since 2.1
 * @author Derek Hulley
 */
public interface ContentWorker
{
}

View File

@ -0,0 +1,59 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
/**
 * An interface for instances that are able to identify content based on the
 * {@linkplain ContentReader content reader}. This is specifically
 * aimed at extractors, transformers, injectors and similar classes.
 * <p>
 * The notion of supplying some type of worker looks a bit odd here, but
 * really an instance of this type will act as an optional factory. Also,
 * in the context of the calling class, the context and the generics will
 * identify exactly which type is returned by the factory.
 *
 * @param <W> the type of {@link ContentWorker} that this selector provides
 *
 * @since 2.1
 * @author Derek Hulley
 */
public interface ContentWorkerSelector<W extends ContentWorker>
{
/**
 * Provides a worker appropriate to the given content, if possible. The reader
 * should only be used if absolutely required. The caller should always request
 * {@linkplain ContentReader#getReader() a new reader} or check the
 * {@linkplain ContentReader#isClosed() reader's state}.
 *
 * @param reader the content reader, providing the actual stream metadata
 *               and even the stream, if required.
 * @return Return a worker that can operate on the content, or <tt>null</tt>
 *         if this identifier doesn't support the content.
 * @throws ContentIOException
 *         if the search fails
 */
W getWorker(ContentReader reader);
}

View File

@ -254,6 +254,10 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* be used to extract values from the documents. The metadata extraction becomes fully
* configuration-driven, i.e. declaring further mappings will result in more values being
* extracted from the documents.
* <p>
* Most extractors will not be using this method. For an example of its use, see the
* {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping
* to select specific user properties from a document.
*/
protected final Map<String, Set<QName>> getMapping()
{
@ -324,7 +328,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
for (Map.Entry entry : mappingProperties.entrySet())
{
String propertyName = (String) entry.getKey();
if (propertyName.startsWith("namespace.prefix."))
if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
{
String prefix = propertyName.substring(17);
String namespace = (String) entry.getValue();
@ -677,15 +681,15 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
* system administrators can accurately determine how to best enhance or override the
* default mapping.
* <p>
* If the default mapping is declared in a properties file, then the
* {@link #readMappingProperties(String)} method can be used to quickly generate the
* return value:
* <pre>
* protected Map<String, Set<QName>> getDefaultMapping()
* If the default mapping is declared in a properties file other than the one named after
* the class, then the {@link #readMappingProperties(String)} method can be used to quickly
* generate the return value:
* <pre><code>
* protected Map<String, Set<QName>> getDefaultMapping()
* {
* return readMappingProperties(DEFAULT_MAPPING);
* }
* </pre>
* </code></pre>
* The map can also be created in code either statically or during the call.
*
* @return Returns the default, static mapping. It may not be null.

View File

@ -28,17 +28,20 @@ import java.io.Serializable;
import java.util.Map;
import java.util.Set;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
/**
* Interface for document property extracters.
* <p>
* Please pardon the incorrect spelling of <i>extractor</i>.
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
public interface MetadataExtracter
public interface MetadataExtracter extends ContentWorker
{
/**
* A enumeration of functional property overwrite policies. These determine whether extracted properties are

View File

@ -0,0 +1,195 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata.xml;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.alfresco.repo.content.ContentWorkerSelector;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.metadata.MetadataExtracter;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A selector that looks at the root node of an XML document to determine which worker to provide.
 * There are many ways to identify XML documents and this is probably the simplest. Alternate
 * implementations might execute a series of xpath statements or look for specific namespace
 * declarations in the document. The net result is the same, i.e. given an XML document, an
 * extracter is provided to the caller.
 * <p>
 * In this selector, there is no guarantee that the different extracters will generate the same
 * (or even nearly the same) metadata. It is up to the configurer to ensure that if it is a
 * requirement, but otherwise each extracter is responsible for its own mappings. Mostly, though,
 * a root node match will imply a structure that has the necessary metadata.
 * <p>
 * The instance acts as its own SAX handler: parsing is aborted as soon as the first element
 * (the root element) is reported.
 *
 * @since 2.1
 * @author Derek Hulley
 */
public class RootElementNameMetadataExtracterSelector
        extends DefaultHandler
        implements ContentWorkerSelector<MetadataExtracter>
{
    private static final Log logger = LogFactory.getLog(RootElementNameMetadataExtracterSelector.class);

    /** Factory for the parsers used to sniff the root element */
    private SAXParserFactory saxParserFactory;
    /** Mimetypes for which a root element lookup will be attempted */
    private Set<String> supportedMimetypes;
    /** The available extracters, keyed by root element name */
    private Map<String, MetadataExtracter> extractersByRootElementName;

    /**
     * Creates a selector that uses the system default SAX parser factory, supports
     * only {@link MimetypeMap#MIMETYPE_XML} and has no extracters registered.
     */
    public RootElementNameMetadataExtracterSelector()
    {
        saxParserFactory = SAXParserFactory.newInstance();
        supportedMimetypes = new HashSet<String>();
        supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
        extractersByRootElementName = Collections.emptyMap();
    }

    /**
     * Set the SAX parser factory to use to parse the incoming documents. If not set,
     * the default system-wide SAX parser factory is used.
     *
     * @param factoryClassName      A {@link SAXParserFactory} class name
     * @throws IllegalArgumentException if the factory cannot be instantiated; the
     *                              underlying failure is attached as the cause
     */
    public void setSAXParserFactoryClass(String factoryClassName)
    {
        try
        {
            saxParserFactory = SAXParserFactory.newInstance(
                    factoryClassName,
                    this.getClass().getClassLoader());
        }
        catch (Throwable e)
        {
            // Throwable is caught on purpose: factory lookup failures are reported as Errors.
            // Keep the cause so that the real reason is not lost.
            throw new IllegalArgumentException(
                    "Unable to load SAX parser factory from class: " + factoryClassName,
                    e);
        }
    }

    /**
     * Optionally set the mimetypes supported. They must be XML formats that the chosen
     * parser will be able to handle.
     *
     * @param supportedMimetypes    the list of mimetypes. The default is <b>text/xml</b>.
     */
    public void setSupportedMimetypes(Set<String> supportedMimetypes)
    {
        this.supportedMimetypes = supportedMimetypes;
    }

    /**
     * Set the extractors to use.
     *
     * @param extracters        a map of {@linkplain MetadataExtracter} instances
     *                          keyed by root element name
     */
    public void setExtracters(Map<String, MetadataExtracter> extracters)
    {
        this.extractersByRootElementName = extracters;
    }

    /**
     * Performs a match of the root element name to find the correct extracter.
     *
     * @param reader            the reader for the content to examine
     * @return                  Returns the extracter registered against the document's root
     *                          element name, or <tt>null</tt> if the reader's mimetype is not
     *                          one of the supported mimetypes or no extracter is registered
     *                          for the root element
     * @throws ContentIOException if the document cannot be parsed far enough to find
     *                          the root element
     */
    public MetadataExtracter getWorker(ContentReader reader)
    {
        // Honour the configured mimetypes rather than a hard-coded value, otherwise
        // setSupportedMimetypes has no effect
        if (supportedMimetypes == null || !supportedMimetypes.contains(reader.getMimetype()))
        {
            return null;
        }
        MetadataExtracter extracter = null;
        InputStream is = null;
        String rootElementName = null;
        try
        {
            is = reader.getContentInputStream();
            SAXParser saxParser = saxParserFactory.newSAXParser();
            saxParser.parse(is, this);
            // The parse completed without any element being reported: no match possible
        }
        catch (RootElementFoundException e)
        {
            // Expected: startElement aborts the parse as soon as the root element is seen
            rootElementName = e.getElementName();
            extracter = extractersByRootElementName.get(rootElementName);
        }
        catch (Throwable e)
        {
            throw new ContentIOException("Failed to extract root element from XML document", e);
        }
        finally
        {
            if (is != null)
            {
                // Best-effort close: a failure here must not mask the result or the real error
                try { is.close(); } catch (Throwable e) {}
            }
        }
        // Done
        if (logger.isDebugEnabled())
        {
            logger.debug("\n" +
                    "Chosen metadata extracter for reader: \n" +
                    "   Reader:       " + reader + "\n" +
                    "   Root Element: " + rootElementName + "\n" +
                    "   Extracter:    " + extracter);
        }
        return extracter;
    }

    /**
     * Aborts the parse on the very first element by throwing an exception that carries
     * the element's name.
     * <p>
     * The local name is used when the parser supplies one. When namespace processing is
     * not being performed (the default for a {@link SAXParserFactory}), SAX reports the
     * local name as an empty string, so the qualified name is used instead.
     *
     * @throws SAXException     always; a {@link RootElementFoundException} carrying the name
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
    {
        String elementName = (localName == null || localName.length() == 0) ? qName : localName;
        throw new RootElementFoundException(elementName);
    }

    /**
     * An exception to break out of the XML parsing early
     */
    private static class RootElementFoundException extends SAXException
    {
        private static final long serialVersionUID = 6845880422947198814L;

        private String elementName;

        /**
         * @param elementName   the name of the root element that was found
         */
        public RootElementFoundException(String elementName)
        {
            super(elementName);
            this.elementName = elementName;
        }

        /**
         * @return              Returns the name of the root element that was found
         */
        public String getElementName()
        {
            return elementName;
        }
    }
}

View File

@ -0,0 +1,302 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata.xml;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.alfresco.util.ParameterCheck;
import org.alfresco.util.PropertyCheck;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
/**
 * An extracter that pulls values from XML documents using configurable XPath
 * statements. It is not possible to list a default set of mappings - this is
 * down to the configuration only.
 * <p>
 * When an instance of this extracter is configured, XPath statements should be
 * provided to extract all the available metadata. The implementation is sensitive
 * to what is actually requested by the
 * {@linkplain AbstractMappingMetadataExtracter#setMapping(Map) configured mapping}
 * and will only perform the queries necessary to fulfill the requirements.
 * <p>
 * To summarize, there are two configurations required for this class:
 * <ul>
 *   <li>
 *       A mapping of all reasonable document properties to XPath statements.
 *       See {@link #setXpathMappingProperties(Properties)}.
 *   </li>
 *   <li>
 *       A mapping of document property names to Alfresco repository model QNames.
 *       See {@link AbstractMappingMetadataExtracter#setMappingProperties(java.util.Properties)}.
 *   </li>
 * </ul>
 * <p>
 * The mapping of document properties to XPaths must look as follows:
 * <pre>
 *    # Get the author
 *    author=/root/author/@name
 * </pre>
 *
 * @author Derek Hulley
 */
public class XPathMetadataExtracter extends AbstractMappingMetadataExtracter implements NamespaceContext
{
    public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_XML};

    private static final Log logger = LogFactory.getLog(XPathMetadataExtracter.class);

    private DocumentBuilder documentBuilder;
    private XPathFactory xpathFactory;
    /** The raw XPath configuration; it is only compiled during {@link #init()} */
    private Properties xpathMappingProperties;
    private Map<String, String> namespacesByPrefix;
    private Map<String, XPathExpression> xpathExpressionMapping;

    /**
     * Default constructor
     */
    public XPathMetadataExtracter()
    {
        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
    }

    /** {@inheritDoc} */
    public String getNamespaceURI(String prefix)
    {
        ParameterCheck.mandatoryString("prefix", prefix);
        return namespacesByPrefix.get(prefix);
    }

    /** {@inheritDoc} */
    public String getPrefix(String namespaceURI)
    {
        ParameterCheck.mandatoryString("namespaceURI", namespaceURI);
        for (Map.Entry<String, String> entry : namespacesByPrefix.entrySet())
        {
            if (namespaceURI.equals(entry.getValue()))
            {
                return entry.getKey();
            }
        }
        return null;
    }

    /** {@inheritDoc} */
    public Iterator<String> getPrefixes(String namespaceURI)
    {
        ParameterCheck.mandatoryString("namespaceURI", namespaceURI);
        List<String> prefixes = new ArrayList<String>(2);
        for (Map.Entry<String, String> entry : namespacesByPrefix.entrySet())
        {
            if (namespaceURI.equals(entry.getValue()))
            {
                prefixes.add(entry.getKey());
            }
        }
        return prefixes.iterator();
    }

    /**
     * Set the properties file that maps document properties to the XPath statements
     * necessary to retrieve them.
     * <p>
     * The Xpath mapping is of the form:
     * <pre>
     * # Namespaces prefixes
     * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
     * namespace.prefix.my=http://www....com/alfresco/1.0
     *
     * # Mapping
     * editor=/cm:some-xpath-1
     * title=/my:some-xpath-2
     * </pre>
     * The properties are only stored here. The XPath statements are compiled during
     * {@link #init()} because the XPath factory does not exist until then.
     *
     * @param xpathMappingProperties    the namespace declarations and XPath statements
     */
    public void setXpathMappingProperties(Properties xpathMappingProperties)
    {
        this.xpathMappingProperties = xpathMappingProperties;
    }

    /**
     * Checks that the XPath mapping has been provided, creates the XML infrastructure,
     * initializes the base class and then compiles the XPath statements.
     */
    @Override
    protected void init()
    {
        PropertyCheck.mandatory(this, "xpathMappingProperties", xpathMappingProperties);
        try
        {
            // NOTE(review): the factory defaults are used as-is, i.e. not namespace-aware and
            // with no restrictions on DTDs or external entities - confirm that this is suitable
            // for namespace-prefixed XPaths and for untrusted content.
            documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            xpathFactory = XPathFactory.newInstance();
        }
        catch (Throwable e)
        {
            throw new AlfrescoRuntimeException("Failed to initialize XML metadata extractor", e);
        }
        super.init();
        // The XPaths are compiled last: this needs the XPath factory created above and
        // the base class mapping, which is presumably only complete after super.init()
        namespacesByPrefix = new HashMap<String, String>(7);
        xpathExpressionMapping = new HashMap<String, XPathExpression>(17);
        readXPathMappingProperties(xpathMappingProperties);
    }

    /**
     * It is not possible to have any default mappings, but something has to be returned.
     *
     * @return          Returns an empty map
     */
    @Override
    protected Map<String, Set<QName>> getDefaultMapping()
    {
        return Collections.emptyMap();
    }

    /**
     * Parses the content into a DOM document and runs the configured XPath statements
     * against it.
     *
     * @param reader    the reader for the XML content
     * @return          Returns the values keyed by document property name
     */
    @Override
    protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
    {
        InputStream is = null;
        try
        {
            is = reader.getContentInputStream();
            Document doc = documentBuilder.parse(is);
            Map<String, Serializable> rawProperties = processDocument(doc);
            if (logger.isDebugEnabled())
            {
                logger.debug("\n" +
                        "Extracted XML metadata: \n" +
                        "   Reader:  " + reader + "\n" +
                        "   Results: " + rawProperties);
            }
            return rawProperties;
        }
        finally
        {
            if (is != null)
            {
                // Best-effort close: a failure here must not mask the result or the real error
                try { is.close(); } catch (IOException e) {}
            }
        }
    }

    /**
     * Executes all the necessary XPath statements to extract values.
     *
     * @param document  the parsed XML document
     * @return          Returns the string result of each compiled XPath, keyed by
     *                  document property name
     */
    protected Map<String, Serializable> processDocument(Document document) throws Throwable
    {
        Map<String, Serializable> rawProperties = super.newRawMap();

        // Execute all the XPaths that we saved
        for (Map.Entry<String, XPathExpression> element : xpathExpressionMapping.entrySet())
        {
            String documentProperty = element.getKey();
            XPathExpression xpathExpression = element.getValue();
            // Execute it
            String value = xpathExpression.evaluate(document);
            // Put the value
            rawProperties.put(documentProperty, value);
        }
        // Done
        return rawProperties;
    }

    /**
     * A utility method to convert the XPath mapping properties to compiled XPath
     * expressions. Namespace prefix declarations are read first so that they are
     * available when the expressions are compiled. Document properties that are not
     * present in the base class mapping are ignored.
     *
     * @param xpathMappingProperties    the namespace declarations and XPath statements
     * @throws AlfrescoRuntimeException if an XPath statement cannot be compiled
     *
     * @see #setXpathMappingProperties(Properties)
     */
    protected void readXPathMappingProperties(Properties xpathMappingProperties)
    {
        // Get the namespaces
        for (Map.Entry<Object, Object> entry : xpathMappingProperties.entrySet())
        {
            String propertyName = (String) entry.getKey();
            if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
            {
                String prefix = propertyName.substring(NAMESPACE_PROPERTY_PREFIX.length());
                String namespace = (String) entry.getValue();
                namespacesByPrefix.put(prefix, namespace);
            }
        }
        // Get the mapping that will be applied by the base class
        Map<String, Set<QName>> finalMapping = getMapping();
        // Create the mapping
        for (Map.Entry<Object, Object> entry : xpathMappingProperties.entrySet())
        {
            String documentProperty = (String) entry.getKey();
            String xpathStr = (String) entry.getValue();
            if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
            {
                // Ignore these now
                continue;
            }
            // If the property is not going to be mapped, then just ignore it too
            if (!finalMapping.containsKey(documentProperty))
            {
                continue;
            }
            // Construct the XPath
            XPath xpath = xpathFactory.newXPath();
            xpath.setNamespaceContext(this);
            XPathExpression xpathExpression = null;
            try
            {
                xpathExpression = xpath.compile(xpathStr);
            }
            catch (XPathExpressionException e)
            {
                // Keep the cause: it carries the position and reason of the syntax error
                throw new AlfrescoRuntimeException(
                        "Failed to parse XPath expression: \n" +
                        "   Document property: " + documentProperty + "\n" +
                        "   XPath:             " + xpathStr,
                        e);
            }
            // Persist it
            xpathExpressionMapping.put(documentProperty, xpathExpression);
            if (logger.isDebugEnabled())
            {
                // Log the XPath text; the compiled expression has no useful string form
                logger.debug("Added mapping from " + documentProperty + " to " + xpathStr);
            }
        }
        // Done
    }
}

View File

@ -26,6 +26,7 @@ package org.alfresco.repo.content.transform;
import java.util.Map;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
@ -35,7 +36,7 @@ import org.alfresco.service.cmr.repository.ContentWriter;
*
* @author Derek Hulley
*/
public interface ContentTransformer
public interface ContentTransformer extends ContentWorker
{
/**
* Provides the approximate accuracy with which this transformer can