mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-06-16 17:55:15 +00:00
XPath-based XML metadata extractor
- No tests - Simple root element name redirector Some comments fleshed out on the new mapping metadata extractor git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@5969 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
parent
1b97517ce6
commit
f770bb0190
38
source/java/org/alfresco/repo/content/ContentWorker.java
Normal file
38
source/java/org/alfresco/repo/content/ContentWorker.java
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License
|
||||||
|
* as published by the Free Software Foundation; either version 2
|
||||||
|
* of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
|
|
||||||
|
* As a special exception to the terms and conditions of version 2.0 of
|
||||||
|
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||||
|
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||||
|
* FLOSS exception. You should have received a copy of the text describing
|
||||||
|
* the FLOSS exception, and it is also available here:
|
||||||
|
* http://www.alfresco.com/legal/licensing"
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content;
|
||||||
|
|
||||||
|
/**
 * Marker interface for components that operate on content.
 * <p>
 * It declares no methods of its own. It exists so that the more specific
 * <i>worker</i> interfaces - metadata extractors, content transformers and
 * so forth - share a common super-type.
 *
 * @see ContentWorkerSelector
 * @since 2.1
 * @author Derek Hulley
 */
public interface ContentWorker
{
    // Intentionally empty: this is a marker interface.
}
|
@ -0,0 +1,59 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License
|
||||||
|
* as published by the Free Software Foundation; either version 2
|
||||||
|
* of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
|
|
||||||
|
* As a special exception to the terms and conditions of version 2.0 of
|
||||||
|
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||||
|
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||||
|
* FLOSS exception. You should have received a copy of the text describing
|
||||||
|
* the FLOSS exception, and it is also available here:
|
||||||
|
* http://www.alfresco.com/legal/licensing"
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content;
|
||||||
|
|
||||||
|
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An interface instances that are able to identify content based on the
|
||||||
|
* {@linkplain ContentReader content reader}. This is specifically
|
||||||
|
* aimed at extractors, transformers, injectors and similar classes.
|
||||||
|
* <p>
|
||||||
|
* The notion of supplying some type of worker looks a bit odd here, but
|
||||||
|
* really an instance of this type will act as an optional factory. Also,
|
||||||
|
* in the context of the calling class, the context and the generics will
|
||||||
|
* identify exactly which type is returned by the factory.
|
||||||
|
*
|
||||||
|
* @since 2.1
|
||||||
|
* @author Derek Hulley
|
||||||
|
*/
|
||||||
|
public interface ContentWorkerSelector<W extends ContentWorker>
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Provides an worker appropriate to the given content, if possible. The reader
|
||||||
|
* should only be used if absolutely required. The caller should always request
|
||||||
|
* {@linkplain ContentReader#getReader() a new reader} or check the
|
||||||
|
* {@linkplain ContentReader#isClosed() reader's state}.
|
||||||
|
*
|
||||||
|
* @param reader the content reader, providing the actual stream metadata
|
||||||
|
* and even the stream, if required.
|
||||||
|
* @return Return a worker that can operate on the content, or <tt>null</tt>
|
||||||
|
* if this identifier doesn't support the content.
|
||||||
|
* @throws ContentIOException
|
||||||
|
* if the search fails
|
||||||
|
*/
|
||||||
|
W getWorker(ContentReader reader);
|
||||||
|
}
|
@ -254,6 +254,10 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
* be used to extract values from the documents. The metadata extraction becomes fully
|
* be used to extract values from the documents. The metadata extraction becomes fully
|
||||||
* configuration-driven, i.e. declaring further mappings will result in more values being
|
* configuration-driven, i.e. declaring further mappings will result in more values being
|
||||||
* extracted from the documents.
|
* extracted from the documents.
|
||||||
|
* <p>
|
||||||
|
* Most extractors will not be using this method. For an example of its use, see the
|
||||||
|
* {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping
|
||||||
|
* to select specific user properties from a document.
|
||||||
*/
|
*/
|
||||||
protected final Map<String, Set<QName>> getMapping()
|
protected final Map<String, Set<QName>> getMapping()
|
||||||
{
|
{
|
||||||
@ -324,7 +328,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
for (Map.Entry entry : mappingProperties.entrySet())
|
for (Map.Entry entry : mappingProperties.entrySet())
|
||||||
{
|
{
|
||||||
String propertyName = (String) entry.getKey();
|
String propertyName = (String) entry.getKey();
|
||||||
if (propertyName.startsWith("namespace.prefix."))
|
if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX))
|
||||||
{
|
{
|
||||||
String prefix = propertyName.substring(17);
|
String prefix = propertyName.substring(17);
|
||||||
String namespace = (String) entry.getValue();
|
String namespace = (String) entry.getValue();
|
||||||
@ -677,15 +681,15 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac
|
|||||||
* system administrators can accurately determine how to best enhance or override the
|
* system administrators can accurately determine how to best enhance or override the
|
||||||
* default mapping.
|
* default mapping.
|
||||||
* <p>
|
* <p>
|
||||||
* If the default mapping is declared in a properties file, then the
|
* If the default mapping is declared in a properties file other than the one named after
|
||||||
* {@link #readMappingProperties(String)} method can be used to quickly generate the
|
* the class, then the {@link #readMappingProperties(String)} method can be used to quickly
|
||||||
* return value:
|
* generate the return value:
|
||||||
* <pre>
|
* <pre><code>
|
||||||
* protected Map<String, Set<QName>> getDefaultMapping()
|
* protected Map<String, Set<QName>> getDefaultMapping()
|
||||||
* {
|
* {
|
||||||
* return readMappingProperties(DEFAULT_MAPPING);
|
* return readMappingProperties(DEFAULT_MAPPING);
|
||||||
* }
|
* }
|
||||||
* </pre>
|
* </code></pre>
|
||||||
* The map can also be created in code either statically or during the call.
|
* The map can also be created in code either statically or during the call.
|
||||||
*
|
*
|
||||||
* @return Returns the default, static mapping. It may not be null.
|
* @return Returns the default, static mapping. It may not be null.
|
||||||
|
@ -28,17 +28,20 @@ import java.io.Serializable;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.alfresco.repo.content.ContentWorker;
|
||||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
import org.alfresco.service.namespace.QName;
|
import org.alfresco.service.namespace.QName;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Interface for document property extracters.
|
* Interface for document property extracters.
|
||||||
|
* <p>
|
||||||
|
* Please pardon the incorrect spelling of <i>extractor</i>.
|
||||||
*
|
*
|
||||||
* @author Jesper Steen Møller
|
* @author Jesper Steen Møller
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
*/
|
*/
|
||||||
public interface MetadataExtracter
|
public interface MetadataExtracter extends ContentWorker
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* A enumeration of functional property overwrite policies. These determine whether extracted properties are
|
* A enumeration of functional property overwrite policies. These determine whether extracted properties are
|
||||||
|
@ -0,0 +1,195 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License
|
||||||
|
* as published by the Free Software Foundation; either version 2
|
||||||
|
* of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
|
|
||||||
|
* As a special exception to the terms and conditions of version 2.0 of
|
||||||
|
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||||
|
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||||
|
* FLOSS exception. You should have received a copy of the text describing
|
||||||
|
* the FLOSS exception, and it is also available here:
|
||||||
|
* http://www.alfresco.com/legal/licensing"
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.metadata.xml;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import javax.xml.parsers.SAXParser;
|
||||||
|
import javax.xml.parsers.SAXParserFactory;
|
||||||
|
|
||||||
|
import org.alfresco.repo.content.ContentWorkerSelector;
|
||||||
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.repo.content.metadata.MetadataExtracter;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.xml.sax.Attributes;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
import org.xml.sax.helpers.DefaultHandler;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A selector that looks at the root node of an XML document to determine which worker to provide.
|
||||||
|
* There are many ways to identify XML documents and this is probably the simplest. Alternate
|
||||||
|
* implementations might execute a series of xpath statements or look for specific namespace
|
||||||
|
* declarations in the document. The net result is the same, i.e. given an XML document, an
|
||||||
|
* extracter is provided to the caller.
|
||||||
|
* <p>
|
||||||
|
* In this selector, there is no guarantee that the different extracters will generate the same
|
||||||
|
* (or even nearly the same) metadata. It is up to the configurer to ensure that if it is a
|
||||||
|
* requirement, but otherwise each extracter is responsible for its own mappings. Mostly, though,
|
||||||
|
* a root node match will imply a structure that has the necessary metadata.
|
||||||
|
*
|
||||||
|
* @since 2.1
|
||||||
|
* @author Derek Hulley
|
||||||
|
*/
|
||||||
|
public class RootElementNameMetadataExtracterSelector
|
||||||
|
extends DefaultHandler
|
||||||
|
implements ContentWorkerSelector<MetadataExtracter>
|
||||||
|
{
|
||||||
|
private static Log logger = LogFactory.getLog(RootElementNameMetadataExtracterSelector.class);
|
||||||
|
|
||||||
|
private SAXParserFactory saxParserFactory;
|
||||||
|
private Set<String> supportedMimetypes;
|
||||||
|
private Map<String, MetadataExtracter> extractersByRootElementName;
|
||||||
|
|
||||||
|
public RootElementNameMetadataExtracterSelector()
|
||||||
|
{
|
||||||
|
saxParserFactory = SAXParserFactory.newInstance();
|
||||||
|
supportedMimetypes = new HashSet<String>();
|
||||||
|
supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
|
||||||
|
extractersByRootElementName = Collections.emptyMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the SAX parser factory to use to parse the incoming documents. If not set,
|
||||||
|
* the default system-wide SAX parser factory is used.
|
||||||
|
*
|
||||||
|
* @param factoryClassName A {@link SAXParserFactory} class name
|
||||||
|
*/
|
||||||
|
public void setSAXParserFactoryClass(String factoryClassName)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
saxParserFactory = SAXParserFactory.newInstance(
|
||||||
|
factoryClassName,
|
||||||
|
this.getClass().getClassLoader());
|
||||||
|
}
|
||||||
|
catch (Throwable e)
|
||||||
|
{
|
||||||
|
throw new IllegalArgumentException("Unable to load SAX parser factory from class: " + factoryClassName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optionally set the mimetypes supported. They must be XML formats that the chosen
|
||||||
|
* parser will be able to handle.
|
||||||
|
*
|
||||||
|
* @param supportedMimetypes the list of mimetypes. The default is <b>text/xml</b>.
|
||||||
|
*/
|
||||||
|
public void setSupportedMimetypes(Set<String> supportedMimetypes)
|
||||||
|
{
|
||||||
|
this.supportedMimetypes = supportedMimetypes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the extractors to use.
|
||||||
|
*
|
||||||
|
* @param extracters a map of {@linkplain MetadataExtracter} instances
|
||||||
|
* keyed by root element name
|
||||||
|
*/
|
||||||
|
public void setExtracters(Map<String, MetadataExtracter> extracters)
|
||||||
|
{
|
||||||
|
this.extractersByRootElementName = extracters;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs a match of the root element name to find the correct extracter.
|
||||||
|
*/
|
||||||
|
public MetadataExtracter getWorker(ContentReader reader)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Is xml the only mimetype to support?
|
||||||
|
*/
|
||||||
|
if (!reader.getMimetype().equals(MimetypeMap.MIMETYPE_XML))
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
MetadataExtracter extracter = null;
|
||||||
|
InputStream is = null;
|
||||||
|
String rootElementName = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
is = reader.getContentInputStream();
|
||||||
|
SAXParser saxParser = saxParserFactory.newSAXParser();
|
||||||
|
saxParser.parse(is, this);
|
||||||
|
// No match possible
|
||||||
|
}
|
||||||
|
catch (RootElementFoundException e)
|
||||||
|
{
|
||||||
|
rootElementName = e.getElementName();
|
||||||
|
extracter = extractersByRootElementName.get(rootElementName);
|
||||||
|
}
|
||||||
|
catch (Throwable e)
|
||||||
|
{
|
||||||
|
throw new ContentIOException("Failed to extract root element from XML document", e);
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (is != null)
|
||||||
|
{
|
||||||
|
try { is.close(); } catch (Throwable e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Done
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("\n" +
|
||||||
|
"Chosen metadata extracter for reader: \n" +
|
||||||
|
" Reader: " + reader + "\n" +
|
||||||
|
" Root Element: " + rootElementName + "\n" +
|
||||||
|
" Extracter: " + extracter);
|
||||||
|
}
|
||||||
|
return extracter;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
|
||||||
|
{
|
||||||
|
throw new RootElementFoundException(localName);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An exception to break out of the XML parsing early
|
||||||
|
*/
|
||||||
|
private static class RootElementFoundException extends SAXException
|
||||||
|
{
|
||||||
|
private static final long serialVersionUID = 6845880422947198814L;
|
||||||
|
private String elementName;
|
||||||
|
public RootElementFoundException(String elementName)
|
||||||
|
{
|
||||||
|
super(elementName);
|
||||||
|
this.elementName = elementName;
|
||||||
|
}
|
||||||
|
public String getElementName()
|
||||||
|
{
|
||||||
|
return elementName;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,302 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2005 Jesper Steen Møller
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License
|
||||||
|
* as published by the Free Software Foundation; either version 2
|
||||||
|
* of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||||
|
|
||||||
|
* As a special exception to the terms and conditions of version 2.0 of
|
||||||
|
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||||
|
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||||
|
* FLOSS exception. You should have received a copy of the text describing
|
||||||
|
* the FLOSS exception, and it is also available here:
|
||||||
|
* http://www.alfresco.com/legal/licensing"
|
||||||
|
*/
|
||||||
|
package org.alfresco.repo.content.metadata.xml;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Properties;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import javax.xml.namespace.NamespaceContext;
|
||||||
|
import javax.xml.parsers.DocumentBuilder;
|
||||||
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
|
import javax.xml.xpath.XPath;
|
||||||
|
import javax.xml.xpath.XPathExpression;
|
||||||
|
import javax.xml.xpath.XPathExpressionException;
|
||||||
|
import javax.xml.xpath.XPathFactory;
|
||||||
|
|
||||||
|
import org.alfresco.error.AlfrescoRuntimeException;
|
||||||
|
import org.alfresco.repo.content.MimetypeMap;
|
||||||
|
import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter;
|
||||||
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
|
import org.alfresco.service.namespace.QName;
|
||||||
|
import org.alfresco.util.ParameterCheck;
|
||||||
|
import org.alfresco.util.PropertyCheck;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.w3c.dom.Document;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An extracter that pulls values from XML documents using configurable XPath
|
||||||
|
* statements. It is not possible to list a default set of mappings - this is
|
||||||
|
* down to the configuration only.
|
||||||
|
* <p>
|
||||||
|
* When an instance of this extracter is configured, XPath statements should be
|
||||||
|
* provided to extract all the available metadata. The implementation is sensitive
|
||||||
|
* to what is actually requested by the
|
||||||
|
* {@linkplain AbstractMappingMetadataExtracter#setMapping(Map) configured mapping}
|
||||||
|
* and will only perform the queries necessary to fulfill the requirements.
|
||||||
|
* <p>
|
||||||
|
* To summarize, there are two configurations required for this class:
|
||||||
|
* <ul>
|
||||||
|
* <li>
|
||||||
|
* A mapping of all reasonable document properties to XPath statements.
|
||||||
|
* See {@link AbstractMappingMetadataExtracter#setMappingProperties(java.util.Properties)}.
|
||||||
|
* </li>
|
||||||
|
* <li>
|
||||||
|
* A mapping of document property names to Alfresco repository model QNames.
|
||||||
|
* See {@link #setXPathMappingProperties(Properties).}
|
||||||
|
* </li>
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* The mapping of document properties to XPaths must look as follows:
|
||||||
|
* <pre>
|
||||||
|
* # Get the author
|
||||||
|
* author=/root/author@name
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* @author Derek Hulley
|
||||||
|
*/
|
||||||
|
public class XPathMetadataExtracter extends AbstractMappingMetadataExtracter implements NamespaceContext
|
||||||
|
{
|
||||||
|
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_XML};
|
||||||
|
|
||||||
|
private static Log logger = LogFactory.getLog(XPathMetadataExtracter.class);
|
||||||
|
|
||||||
|
private DocumentBuilder documentBuilder;
|
||||||
|
private XPathFactory xpathFactory;
|
||||||
|
private Map<String, String> namespacesByPrefix;
|
||||||
|
private Map<String, XPathExpression> xpathExpressionMapping;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default constructor
|
||||||
|
*/
|
||||||
|
public XPathMetadataExtracter()
|
||||||
|
{
|
||||||
|
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** {@inheritDoc} */
|
||||||
|
public String getNamespaceURI(String prefix)
|
||||||
|
{
|
||||||
|
ParameterCheck.mandatoryString("prefix", prefix);
|
||||||
|
return namespacesByPrefix.get(prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** {@inheritDoc} */
|
||||||
|
public String getPrefix(String namespaceURI)
|
||||||
|
{
|
||||||
|
ParameterCheck.mandatoryString("namespaceURI", namespaceURI);
|
||||||
|
for (Map.Entry<String, String> entry : namespacesByPrefix.entrySet())
|
||||||
|
{
|
||||||
|
if (namespaceURI.equals(entry.getValue()))
|
||||||
|
{
|
||||||
|
return entry.getKey();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** {@inheritDoc} */
|
||||||
|
public Iterator getPrefixes(String namespaceURI)
|
||||||
|
{
|
||||||
|
ParameterCheck.mandatoryString("namespaceURI", namespaceURI);
|
||||||
|
List<String> prefixes = new ArrayList<String>(2);
|
||||||
|
for (Map.Entry<String, String> entry : namespacesByPrefix.entrySet())
|
||||||
|
{
|
||||||
|
if (namespaceURI.equals(entry.getValue()))
|
||||||
|
{
|
||||||
|
prefixes.add(entry.getKey());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return prefixes.iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the properties file that maps document properties to the XPath statements
|
||||||
|
* necessary to retrieve them.
|
||||||
|
* <p>
|
||||||
|
* The Xpath mapping is of the form:
|
||||||
|
* <pre>
|
||||||
|
* # Namespaces prefixes
|
||||||
|
* namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||||
|
* namespace.prefix.my=http://www....com/alfresco/1.0
|
||||||
|
*
|
||||||
|
* # Mapping
|
||||||
|
* editor=/cm:some-xpath-1
|
||||||
|
* title=/my:some-xpath-2
|
||||||
|
* </pre>
|
||||||
|
*/
|
||||||
|
public void setXpathMappingProperties(Properties xpathMappingProperties)
|
||||||
|
{
|
||||||
|
namespacesByPrefix = new HashMap<String, String>(7);
|
||||||
|
xpathExpressionMapping = new HashMap<String, XPathExpression>(17);
|
||||||
|
readXPathMappingProperties(xpathMappingProperties);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void init()
|
||||||
|
{
|
||||||
|
PropertyCheck.mandatory(this, "xpathMappingProperties", xpathExpressionMapping);
|
||||||
|
try
|
||||||
|
{
|
||||||
|
documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
|
||||||
|
xpathFactory = XPathFactory.newInstance();
|
||||||
|
}
|
||||||
|
catch (Throwable e)
|
||||||
|
{
|
||||||
|
throw new AlfrescoRuntimeException("Failed to initialize XML metadata extractor", e);
|
||||||
|
}
|
||||||
|
super.init();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* It is not possible to have any default mappings, but something has to be returned.
|
||||||
|
*
|
||||||
|
* @return Returns an empty map
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected Map<String, Set<QName>> getDefaultMapping()
|
||||||
|
{
|
||||||
|
return Collections.emptyMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||||
|
{
|
||||||
|
InputStream is = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
is = reader.getContentInputStream();
|
||||||
|
Document doc = documentBuilder.parse(is);
|
||||||
|
Map<String, Serializable> rawProperties = processDocument(doc);
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("\n" +
|
||||||
|
"Extracted XML metadata: \n" +
|
||||||
|
" Reader: " + reader + "\n" +
|
||||||
|
" Results: " + rawProperties);
|
||||||
|
}
|
||||||
|
return rawProperties;
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (is != null)
|
||||||
|
{
|
||||||
|
try { is.close(); } catch (IOException e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes all the necessary XPath statements to extract values.
|
||||||
|
*/
|
||||||
|
protected Map<String, Serializable> processDocument(Document document) throws Throwable
|
||||||
|
{
|
||||||
|
Map<String, Serializable> rawProperties = super.newRawMap();
|
||||||
|
|
||||||
|
// Execute all the XPaths that we saved
|
||||||
|
for (Map.Entry<String, XPathExpression> element : xpathExpressionMapping.entrySet())
|
||||||
|
{
|
||||||
|
String documentProperty = element.getKey();
|
||||||
|
XPathExpression xpathExpression = element.getValue();
|
||||||
|
// Execute it
|
||||||
|
String value = xpathExpression.evaluate(document);
|
||||||
|
// Put the value
|
||||||
|
rawProperties.put(documentProperty, value);
|
||||||
|
}
|
||||||
|
// Done
|
||||||
|
return rawProperties;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A utility method to convert mapping properties to the Map form.
|
||||||
|
*
|
||||||
|
* @see #setMappingProperties(Properties)
|
||||||
|
*/
|
||||||
|
protected void readXPathMappingProperties(Properties xpathMappingProperties)
|
||||||
|
{
|
||||||
|
// Get the namespaces
|
||||||
|
for (Map.Entry entry : xpathMappingProperties.entrySet())
|
||||||
|
{
|
||||||
|
String propertyName = (String) entry.getKey();
|
||||||
|
if (propertyName.startsWith("namespace.prefix."))
|
||||||
|
{
|
||||||
|
String prefix = propertyName.substring(17);
|
||||||
|
String namespace = (String) entry.getValue();
|
||||||
|
namespacesByPrefix.put(prefix, namespace);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Get the mapping that will be applied by the base class
|
||||||
|
Map<String, Set<QName>> finalMapping = getMapping();
|
||||||
|
// Create the mapping
|
||||||
|
for (Map.Entry entry : xpathMappingProperties.entrySet())
|
||||||
|
{
|
||||||
|
String documentProperty = (String) entry.getKey();
|
||||||
|
String xpathStr = (String) entry.getValue();
|
||||||
|
if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX))
|
||||||
|
{
|
||||||
|
// Ignore these now
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// If the property is not going to be mapped, then just ignore it too
|
||||||
|
if (!finalMapping.containsKey(documentProperty))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Construct the XPath
|
||||||
|
XPath xpath = xpathFactory.newXPath();
|
||||||
|
xpath.setNamespaceContext(this);
|
||||||
|
XPathExpression xpathExpression = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
xpathExpression = xpath.compile(xpathStr);
|
||||||
|
}
|
||||||
|
catch (XPathExpressionException e)
|
||||||
|
{
|
||||||
|
throw new AlfrescoRuntimeException(
|
||||||
|
"Failed to path XPath expression: \n" +
|
||||||
|
" Document property: " + documentProperty + "\n" +
|
||||||
|
" XPath: " + xpathStr);
|
||||||
|
}
|
||||||
|
// Persist it
|
||||||
|
xpathExpressionMapping.put(documentProperty, xpathExpression);
|
||||||
|
if (logger.isDebugEnabled())
|
||||||
|
{
|
||||||
|
logger.debug("Added mapping from " + documentProperty + " to " + xpathExpression);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Done
|
||||||
|
}
|
||||||
|
}
|
@ -26,6 +26,7 @@ package org.alfresco.repo.content.transform;
|
|||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.alfresco.repo.content.ContentWorker;
|
||||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||||
import org.alfresco.service.cmr.repository.ContentReader;
|
import org.alfresco.service.cmr.repository.ContentReader;
|
||||||
import org.alfresco.service.cmr.repository.ContentWriter;
|
import org.alfresco.service.cmr.repository.ContentWriter;
|
||||||
@ -35,7 +36,7 @@ import org.alfresco.service.cmr.repository.ContentWriter;
|
|||||||
*
|
*
|
||||||
* @author Derek Hulley
|
* @author Derek Hulley
|
||||||
*/
|
*/
|
||||||
public interface ContentTransformer
|
public interface ContentTransformer extends ContentWorker
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* Provides the approximate accuracy with which this transformer can
|
* Provides the approximate accuracy with which this transformer can
|
||||||
|
Loading…
x
Reference in New Issue
Block a user