diff --git a/source/java/org/alfresco/repo/content/ContentWorker.java b/source/java/org/alfresco/repo/content/ContentWorker.java new file mode 100644 index 0000000000..6b2160517d --- /dev/null +++ b/source/java/org/alfresco/repo/content/ContentWorker.java @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2005-2007 Alfresco Software Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + * As a special exception to the terms and conditions of version 2.0 of + * the GPL, you may redistribute this Program in connection with Free/Libre + * and Open Source Software ("FLOSS") applications as described in Alfresco's + * FLOSS exception. You should have recieved a copy of the text describing + * the FLOSS exception, and it is also available here: + * http://www.alfresco.com/legal/licensing" + */ +package org.alfresco.repo.content; + +/** + * An interface for instances that operate on content. This is a marker interface + * for specific worker interfaces such as metadata extractors, content transformers + * and so forth. 
+ * + * @see ContentWorkerSelector + * @since 2.1 + * @author Derek Hulley + */ +public interface ContentWorker +{ +} diff --git a/source/java/org/alfresco/repo/content/ContentWorkerSelector.java b/source/java/org/alfresco/repo/content/ContentWorkerSelector.java new file mode 100644 index 0000000000..332d2245eb --- /dev/null +++ b/source/java/org/alfresco/repo/content/ContentWorkerSelector.java @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2005-2007 Alfresco Software Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + * As a special exception to the terms and conditions of version 2.0 of + * the GPL, you may redistribute this Program in connection with Free/Libre + * and Open Source Software ("FLOSS") applications as described in Alfresco's + * FLOSS exception. You should have recieved a copy of the text describing + * the FLOSS exception, and it is also available here: + * http://www.alfresco.com/legal/licensing" + */ +package org.alfresco.repo.content; + +import org.alfresco.service.cmr.repository.ContentIOException; +import org.alfresco.service.cmr.repository.ContentReader; + +/** + * An interface for instances that are able to identify content based on the + * {@linkplain ContentReader content reader}. This is specifically + * aimed at extractors, transformers, injectors and similar classes. + *

+ * The notion of supplying some type of worker looks a bit odd here, but + * really an instance of this type will act as an optional factory. Also, + * in the context of the calling class, the context and the generics will + * identify exactly which type is returned by the factory. + * + * @since 2.1 + * @author Derek Hulley + */ +public interface ContentWorkerSelector +{ + /** + * Provides a worker appropriate to the given content, if possible. The reader + * should only be used if absolutely required. The caller should always request + * {@linkplain ContentReader#getReader() a new reader} or check the + * {@linkplain ContentReader#isClosed() reader's state}. + * + * @param reader the content reader, providing the actual stream metadata + * and even the stream, if required. + * @return Returns a worker that can operate on the content, or null + * if this selector doesn't support the content. + * @throws ContentIOException + * if the search fails + */ + W getWorker(ContentReader reader); +} diff --git a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java index 8376b45b5d..31ac7acc5b 100644 --- a/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/AbstractMappingMetadataExtracter.java @@ -254,6 +254,10 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac * be used to extract values from the documents. The metadata extraction becomes fully * configuration-driven, i.e. declaring further mappings will result in more values being * extracted from the documents. + *

+ * Most extractors will not be using this method. For an example of its use, see the + * {@linkplain OpenDocumentMetadataExtracter OpenDocument extractor}, which uses the mapping + * to select specific user properties from a document. */ protected final Map> getMapping() { @@ -324,7 +328,7 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac for (Map.Entry entry : mappingProperties.entrySet()) { String propertyName = (String) entry.getKey(); - if (propertyName.startsWith("namespace.prefix.")) + if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX)) { String prefix = propertyName.substring(17); String namespace = (String) entry.getValue(); @@ -677,15 +681,15 @@ abstract public class AbstractMappingMetadataExtracter implements MetadataExtrac * system administrators can accurately determine how to best enhance or override the * default mapping. *

- * If the default mapping is declared in a properties file, then the - * {@link #readMappingProperties(String)} method can be used to quickly generate the - * return value: - *

-     *      protected Map> getDefaultMapping()
+     * If the default mapping is declared in a properties file other than the one named after
+     * the class, then the {@link #readMappingProperties(String)} method can be used to quickly
+     * generate the return value:
+     * 

+     *      protected Map&lt;String, Set&lt;QName&gt;&gt; getDefaultMapping()
      *      {
      *          return readMappingProperties(DEFAULT_MAPPING);
      *      }
-     * 
+ *
* The map can also be created in code either statically or during the call. * * @return Returns the default, static mapping. It may not be null. diff --git a/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java index 96a0e35afb..5fa6e36400 100644 --- a/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MetadataExtracter.java @@ -28,17 +28,20 @@ import java.io.Serializable; import java.util.Map; import java.util.Set; +import org.alfresco.repo.content.ContentWorker; import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; /** * Interface for document property extracters. + *

+ * Please pardon the incorrect spelling of extractor. * * @author Jesper Steen Møller * @author Derek Hulley */ -public interface MetadataExtracter +public interface MetadataExtracter extends ContentWorker { /** * A enumeration of functional property overwrite policies. These determine whether extracted properties are diff --git a/source/java/org/alfresco/repo/content/metadata/xml/RootElementNameMetadataExtracterSelector.java b/source/java/org/alfresco/repo/content/metadata/xml/RootElementNameMetadataExtracterSelector.java new file mode 100644 index 0000000000..56b59f1736 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/xml/RootElementNameMetadataExtracterSelector.java @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2005-2007 Alfresco Software Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + * As a special exception to the terms and conditions of version 2.0 of + * the GPL, you may redistribute this Program in connection with Free/Libre + * and Open Source Software ("FLOSS") applications as described in Alfresco's + * FLOSS exception. 
You should have recieved a copy of the text describing + * the FLOSS exception, and it is also available here: + * http://www.alfresco.com/legal/licensing" + */ +package org.alfresco.repo.content.metadata.xml; + +import java.io.InputStream; +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.alfresco.repo.content.ContentWorkerSelector; +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.metadata.MetadataExtracter; +import org.alfresco.service.cmr.repository.ContentIOException; +import org.alfresco.service.cmr.repository.ContentReader; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * A selector that looks at the root node of an XML document to determine which worker to provide. + * There are many ways to identify XML documents and this is probably the simplest. Alternate + * implementations might execute a series of xpath statements or look for specific namespace + * declarations in the document. The net result is the same, i.e. given an XML document, an + * extracter is provided to the caller. + *

+ * In this selector, there is no guarantee that the different extracters will generate the same + * (or even nearly the same) metadata. It is up to the configurer to ensure that if it is a + * requirement, but otherwise each extracter is responsible for its own mappings. Mostly, though, + * a root node match will imply a structure that has the necessary metadata. + * + * @since 2.1 + * @author Derek Hulley + */ +public class RootElementNameMetadataExtracterSelector + extends DefaultHandler + implements ContentWorkerSelector +{ + private static Log logger = LogFactory.getLog(RootElementNameMetadataExtracterSelector.class); + + private SAXParserFactory saxParserFactory; + private Set supportedMimetypes; + private Map extractersByRootElementName; + + public RootElementNameMetadataExtracterSelector() + { + saxParserFactory = SAXParserFactory.newInstance(); + supportedMimetypes = new HashSet(); + supportedMimetypes.add(MimetypeMap.MIMETYPE_XML); + extractersByRootElementName = Collections.emptyMap(); + } + + /** + * Set the SAX parser factory to use to parse the incoming documents. If not set, + * the default system-wide SAX parser factory is used. + * + * @param factoryClassName A {@link SAXParserFactory} class name + * @throws IllegalArgumentException if no factory can be created from the class name; the + * underlying failure is attached as the cause + */ + public void setSAXParserFactoryClass(String factoryClassName) + { + try + { + saxParserFactory = SAXParserFactory.newInstance( + factoryClassName, + this.getClass().getClassLoader()); + } + catch (Throwable e) + { + throw new IllegalArgumentException("Unable to load SAX parser factory from class: " + factoryClassName, e); + } + } + + /** + * Optionally set the mimetypes supported. They must be XML formats that the chosen + * parser will be able to handle. + * + * @param supportedMimetypes the list of mimetypes. The default is text/xml. + */ + public void setSupportedMimetypes(Set supportedMimetypes) + { + this.supportedMimetypes = supportedMimetypes; + } + + /** + * Set the extractors to use. 
+ * + * @param extracters a map of {@linkplain MetadataExtracter} instances + * keyed by root element name + */ + public void setExtracters(Map extracters) + { + this.extractersByRootElementName = extracters; + } + + /** + * Performs a match of the root element name to find the correct extracter. Content with a + * mimetype outside the {@linkplain #setSupportedMimetypes(Set) supported mimetypes} is never read. + * + * @param reader the reader providing the mimetype and the XML stream + * @return Returns the extracter registered for the root element name, or null if the + * mimetype is not supported or no extracter is registered for the root element + * @throws ContentIOException if the root element cannot be read from the document + */ + public MetadataExtracter getWorker(ContentReader reader) + { + /* + * Only the configured mimetypes are handled (MimetypeMap.MIMETYPE_XML unless overridden) + */ + if (!supportedMimetypes.contains(reader.getMimetype())) + { + return null; + } + MetadataExtracter extracter = null; + InputStream is = null; + String rootElementName = null; + try + { + is = reader.getContentInputStream(); + SAXParser saxParser = saxParserFactory.newSAXParser(); + saxParser.parse(is, this); + // No match possible + } + catch (RootElementFoundException e) + { + rootElementName = e.getElementName(); + extracter = extractersByRootElementName.get(rootElementName); + } + catch (Throwable e) + { + throw new ContentIOException("Failed to extract root element from XML document", e); + } + finally + { + if (is != null) + { + try { is.close(); } catch (Throwable e) {} + } + } + // Done + if (logger.isDebugEnabled()) + { + logger.debug("\n" + + "Chosen metadata extracter for reader: \n" + + " Reader: " + reader + "\n" + + " Root Element: " + rootElementName + "\n" + + " Extracter: " + extracter); + } + return extracter; + } + + /** + * Aborts the parse at the first element, i.e. the root element, by throwing an exception + * that carries the element name. The local name is used when the parser supplies one, + * otherwise the qualified name is used. + */ + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException + { + // A parser that is not namespace-aware (the SAXParserFactory default) reports an empty local name + String rootElementName = (localName == null || localName.length() == 0) ? qName : localName; + throw new RootElementFoundException(rootElementName); + } + + /** + * An exception to break out of the XML parsing early + */ + private static class RootElementFoundException extends SAXException + { + private static final long serialVersionUID = 6845880422947198814L; + private String elementName; + public RootElementFoundException(String elementName) + { + super(elementName); + this.elementName = elementName; + } + public String getElementName() + { + return elementName; + } + } +} diff --git 
a/source/java/org/alfresco/repo/content/metadata/xml/XPathMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/xml/XPathMetadataExtracter.java new file mode 100644 index 0000000000..17a23b5d29 --- /dev/null +++ b/source/java/org/alfresco/repo/content/metadata/xml/XPathMetadataExtracter.java @@ -0,0 +1,302 @@ +/* + * Copyright (C) 2005 Jesper Steen Møller + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + * As a special exception to the terms and conditions of version 2.0 of + * the GPL, you may redistribute this Program in connection with Free/Libre + * and Open Source Software ("FLOSS") applications as described in Alfresco's + * FLOSS exception. 
You should have recieved a copy of the text describing + * the FLOSS exception, and it is also available here: + * http://www.alfresco.com/legal/licensing" + */ +package org.alfresco.repo.content.metadata.xml; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; + +import org.alfresco.error.AlfrescoRuntimeException; +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter; +import org.alfresco.service.cmr.repository.ContentReader; +import org.alfresco.service.namespace.QName; +import org.alfresco.util.ParameterCheck; +import org.alfresco.util.PropertyCheck; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.w3c.dom.Document; + +/** + * An extracter that pulls values from XML documents using configurable XPath + * statements. It is not possible to list a default set of mappings - this is + * down to the configuration only. + *

+ * When an instance of this extracter is configured, XPath statements should be + * provided to extract all the available metadata. The implementation is sensitive + * to what is actually requested by the + * {@linkplain AbstractMappingMetadataExtracter#setMapping(Map) configured mapping} + * and will only perform the queries necessary to fulfill the requirements. + *

+ * To summarize, there are two configurations required for this class: + *

+ *

+ * The mapping of document properties to XPaths must look as follows: + *

+ *    # Get the author
+ *    author=/root/author/@name
+ * 
+ * + * @author Derek Hulley + */ +public class XPathMetadataExtracter extends AbstractMappingMetadataExtracter implements NamespaceContext +{ + /** The mimetypes handed to the base class by the default constructor */ + public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_XML}; + + private static Log logger = LogFactory.getLog(XPathMetadataExtracter.class); + + // XML parser and XPath factory; neither is created by the constructor + private DocumentBuilder documentBuilder; + private XPathFactory xpathFactory; + // Both maps are null until setXpathMappingProperties(Properties) has been called + private Map namespacesByPrefix; + private Map xpathExpressionMapping; + + /** + * Default constructor + */ + public XPathMetadataExtracter() + { + super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); + } + + /** + * {@inheritDoc} + * + * Only prefixes held in the prefix-to-namespace map of this instance are resolved. + */ + public String getNamespaceURI(String prefix) + { + ParameterCheck.mandatoryString("prefix", prefix); + return namespacesByPrefix.get(prefix); + } + + /** + * {@inheritDoc} + * + * Returns the first prefix found that is bound to the namespace URI, or null if there is none. + */ + public String getPrefix(String namespaceURI) + { + ParameterCheck.mandatoryString("namespaceURI", namespaceURI); + for (Map.Entry entry : namespacesByPrefix.entrySet()) + { + if (namespaceURI.equals(entry.getValue())) + { + return entry.getKey(); + } + } + return null; + } + + /** + * {@inheritDoc} + * + * The iterator covers every prefix bound to the namespace URI and is empty if there is none. + */ + public Iterator getPrefixes(String namespaceURI) + { + ParameterCheck.mandatoryString("namespaceURI", namespaceURI); + List prefixes = new ArrayList(2); + for (Map.Entry entry : namespacesByPrefix.entrySet()) + { + if (namespaceURI.equals(entry.getValue())) + { + prefixes.add(entry.getKey()); + } + } + return prefixes.iterator(); + } + + /** + * Set the properties file that maps document properties to the XPath statements + * necessary to retrieve them. + *

+ * The Xpath mapping is of the form: + *

+     * # Namespaces prefixes
+     * namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+     * namespace.prefix.my=http://www....com/alfresco/1.0
+     * 
+     * # Mapping
+     * editor=/cm:some-xpath-1
+     * title=/my:some-xpath-2
+     * 
+ * The properties are processed as soon as they are set, i.e. before {@link #init()} is called. + */ + public void setXpathMappingProperties(Properties xpathMappingProperties) + { + namespacesByPrefix = new HashMap(7); + xpathExpressionMapping = new HashMap(17); + // The expressions are compiled by the call below, which needs the XPath factory before init() has run + if (xpathFactory == null) + { + xpathFactory = XPathFactory.newInstance(); + } + // NOTE(review): readXPathMappingProperties also consults getMapping(); confirm the base class mapping is available before init() + readXPathMappingProperties(xpathMappingProperties); + } + + /** + * Runs the mandatory-property check for the XPath mapping, creates the XML parser and, if it + * does not exist yet, the XPath factory, and then hands over to the base class initialization. + * + * @throws AlfrescoRuntimeException if the parser or the factory cannot be created + */ + @Override + protected void init() + { + PropertyCheck.mandatory(this, "xpathMappingProperties", xpathExpressionMapping); + try + { + // NOTE(review): a single DocumentBuilder is shared by all extractions; confirm extractRaw is never called concurrently + documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); + if (xpathFactory == null) + { + xpathFactory = XPathFactory.newInstance(); + } + } + catch (Throwable e) + { + throw new AlfrescoRuntimeException("Failed to initialize XML metadata extractor", e); + } + super.init(); + } + + /** + * It is not possible to have any default mappings, but something has to be returned. + * + * @return Returns an empty map + */ + @Override + protected Map> getDefaultMapping() + { + return Collections.emptyMap(); + } + + /** + * Parses the content as an XML document and evaluates the compiled XPath expressions against it. + * + * @param reader the reader providing the XML stream + * @return Returns the raw values keyed by document property name + */ + @Override + protected Map extractRaw(ContentReader reader) throws Throwable + { + InputStream is = null; + try + { + is = reader.getContentInputStream(); + Document doc = documentBuilder.parse(is); + Map rawProperties = processDocument(doc); + if (logger.isDebugEnabled()) + { + logger.debug("\n" + + "Extracted XML metadata: \n" + + " Reader: " + reader + "\n" + + " Results: " + rawProperties); + } + return rawProperties; + } + finally + { + if (is != null) + { + // Best effort only: a failure to close must not replace the result or the original failure + try { is.close(); } catch (IOException e) {} + } + } + } + + /** + * Executes all the necessary XPath statements to extract values. 
+ * + * @param document the parsed XML document + * @return Returns the string result of each compiled XPath expression, keyed by document property name + */ + protected Map processDocument(Document document) throws Throwable + { + Map rawProperties = super.newRawMap(); + + // Execute all the XPaths that we saved + for (Map.Entry element : xpathExpressionMapping.entrySet()) + { + String documentProperty = element.getKey(); + XPathExpression xpathExpression = element.getValue(); + // Execute it + String value = xpathExpression.evaluate(document); + // Put the value + rawProperties.put(documentProperty, value); + } + // Done + return rawProperties; + } + + /** + * A utility method to convert mapping properties to the Map form. The namespace prefix + * declarations are collected first so that they are all known when the XPath expressions + * of the remaining properties are compiled. Properties that are not part of the mapping + * applied by the base class are skipped. + * + * @param xpathMappingProperties the namespace prefix declarations and the XPath statements + * keyed by document property name + * @throws AlfrescoRuntimeException if an XPath statement cannot be compiled + * + * @see #setXpathMappingProperties(Properties) + */ + protected void readXPathMappingProperties(Properties xpathMappingProperties) + { + // Get the namespaces + for (Map.Entry entry : xpathMappingProperties.entrySet()) + { + String propertyName = (String) entry.getKey(); + if (propertyName.startsWith(NAMESPACE_PROPERTY_PREFIX)) + { + String prefix = propertyName.substring(NAMESPACE_PROPERTY_PREFIX.length()); + String namespace = (String) entry.getValue(); + namespacesByPrefix.put(prefix, namespace); + } + } + // Get the mapping that will be applied by the base class + Map> finalMapping = getMapping(); + // Create the mapping + for (Map.Entry entry : xpathMappingProperties.entrySet()) + { + String documentProperty = (String) entry.getKey(); + String xpathStr = (String) entry.getValue(); + if (documentProperty.startsWith(NAMESPACE_PROPERTY_PREFIX)) + { + // Ignore these now + continue; + } + // If the property is not going to be mapped, then just ignore it too + if (!finalMapping.containsKey(documentProperty)) + { + continue; + } + // Construct the XPath + XPath xpath = xpathFactory.newXPath(); + xpath.setNamespaceContext(this); + XPathExpression xpathExpression = null; + try + { + xpathExpression = xpath.compile(xpathStr); + } + catch (XPathExpressionException e) + { + throw new AlfrescoRuntimeException( + "Failed to parse XPath expression: \n" + + " Document property: " + documentProperty + "\n" + + " XPath: " + xpathStr, + e); + } + // 
Persist it + xpathExpressionMapping.put(documentProperty, xpathExpression); + if (logger.isDebugEnabled()) + { + logger.debug("Added mapping from " + documentProperty + " to " + xpathExpression); + } + } + // Done + } +} diff --git a/source/java/org/alfresco/repo/content/transform/ContentTransformer.java b/source/java/org/alfresco/repo/content/transform/ContentTransformer.java index 976fec6b26..3a0eec3ab6 100644 --- a/source/java/org/alfresco/repo/content/transform/ContentTransformer.java +++ b/source/java/org/alfresco/repo/content/transform/ContentTransformer.java @@ -26,6 +26,7 @@ package org.alfresco.repo.content.transform; import java.util.Map; +import org.alfresco.repo.content.ContentWorker; import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentWriter; @@ -35,7 +36,7 @@ import org.alfresco.service.cmr.repository.ContentWriter; * * @author Derek Hulley */ -public interface ContentTransformer +public interface ContentTransformer extends ContentWorker { /** * Provides the approximate accuracy with which this transformer can