XML metadata extraction with sample.

Added tests into build. This is now ready for testing, comments and suggestions. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@6056 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2025-07-24 17:32:48 +00:00 · 2007-06-21 16:09:03 +00:00
parent 757616bc85
commit 55a6e2f287
10 changed files with 789 additions and 53 deletions
--- a/source/java/org/alfresco/repo/content/selector/ContentWorkerSelector.java
+++ b/source/java/org/alfresco/repo/content/selector/ContentWorkerSelector.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2005-2007 Alfresco Software Limited.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+ * As a special exception to the terms and conditions of version 2.0 of 
+ * the GPL, you may redistribute this Program in connection with Free/Libre 
+ * and Open Source Software ("FLOSS") applications as described in Alfresco's 
+ * FLOSS exception.  You should have received a copy of the text describing 
+ * the FLOSS exception, and it is also available here: 
+ * http://www.alfresco.com/legal/licensing"
+ */
+package org.alfresco.repo.content.selector;
+
+import org.alfresco.repo.content.ContentWorker;
+import org.alfresco.service.cmr.repository.ContentIOException;
+import org.alfresco.service.cmr.repository.ContentReader;
+
+/**
+ * An interface for instances that are able to identify content based on the
+ * {@linkplain ContentReader content reader}.  This is specifically
+ * aimed at extractors, transformers, injectors and similar classes.
+ * <p>
+ * The notion of supplying some type of worker looks a bit odd here, but
+ * really an instance of this type will act as an optional factory.  Also,
+ * in the context of the calling class, the context and the generics will
+ * identify exactly which type is returned by the factory.
+ * 
+ * @since 2.1
+ * @author Derek Hulley
+ */
+public interface ContentWorkerSelector<W extends ContentWorker>
+{
+    /**
+     * Provides a worker appropriate to the given content, if possible.  The reader
+     * should only be used if absolutely required.  The caller should always request
+     * {@linkplain ContentReader#getReader() a new reader} or check the
+     * {@linkplain ContentReader#isClosed() reader's state}.
+     * 
+     * @param reader        the content reader, providing the actual stream metadata
+     *                      and even the stream, if required.
+     * @return              Returns a worker that can operate on the content, or <tt>null</tt>
+     *                      if this selector doesn't support the content.
+     * @throws ContentIOException
+     *                      if the search fails
+     */
+    W getWorker(ContentReader reader);
+}
--- a/source/java/org/alfresco/repo/content/selector/RootElementNameContentWorkerSelector.java
+++ b/source/java/org/alfresco/repo/content/selector/RootElementNameContentWorkerSelector.java
@@ -0,0 +1,190 @@
+/*
+ * Copyright (C) 2005-2007 Alfresco Software Limited.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+ * As a special exception to the terms and conditions of version 2.0 of 
+ * the GPL, you may redistribute this Program in connection with Free/Libre 
+ * and Open Source Software ("FLOSS") applications as described in Alfresco's 
+ * FLOSS exception.  You should have received a copy of the text describing 
+ * the FLOSS exception, and it is also available here: 
+ * http://www.alfresco.com/legal/licensing"
+ */
+package org.alfresco.repo.content.selector;
+
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.alfresco.repo.content.ContentWorker;
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.ContentIOException;
+import org.alfresco.service.cmr.repository.ContentReader;
+import org.alfresco.util.PropertyCheck;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Selects a content worker for an XML document by looking no further than the name of the
+ * document's root element.  This is probably the simplest of the many ways to identify an
+ * XML document; other implementations might run a series of XPath statements or look for
+ * specific namespace declarations.  Either way the result is the same: the caller supplies
+ * an XML document and is given the worker configured for it.
+ * 
+ * @since 2.1
+ * @author Derek Hulley
+ */
+public class RootElementNameContentWorkerSelector<W extends ContentWorker>
+        extends DefaultHandler
+        implements ContentWorkerSelector<ContentWorker>
+{
+    private static Log logger = LogFactory.getLog(RootElementNameContentWorkerSelector.class);
+    
+    private SAXParserFactory saxParserFactory;
+    private Set<String> supportedMimetypes;
+    private Map<String, W> workersByRootElementName;
+    
+    /** Starts with a default SAX parser factory, the XML mimetype only, and no workers. */
+    public RootElementNameContentWorkerSelector()
+    {
+        // NOTE(review): the factory is left on its defaults; DTD and entity loading are not restricted here
+        this.saxParserFactory = SAXParserFactory.newInstance();
+        this.supportedMimetypes = new HashSet<String>(Collections.singleton(MimetypeMap.MIMETYPE_XML));
+        this.workersByRootElementName = Collections.emptyMap();
+    }
+
+    @Override
+    public String toString()
+    {
+        return "RootElementNameContentWorkerSelector" + "[ workers=" + workersByRootElementName + "]";
+    }
+
+    /**
+     * Optionally replaces the supported mimetypes.  Each one must be an XML format that the
+     * chosen parser will be able to handle.
+     * 
+     * @param supportedMimetypes        the mimetypes to accept.  The default is <b>text/xml</b>.
+     */
+    public void setSupportedMimetypes(Set<String> supportedMimetypes)
+    {
+        this.supportedMimetypes = supportedMimetypes;
+    }
+
+    /**
+     * Supplies the workers that can be selected.
+     * 
+     * @param workers               the {@linkplain ContentWorker} instances, keyed by the
+     *                              root element name that each one handles
+     */
+    public void setWorkers(Map<String, W> workers)
+    {
+        this.workersByRootElementName = workers;
+    }
+
+    /** Verifies that the workers and the supported mimetypes have both been provided. */
+    public void init()
+    {
+        PropertyCheck.mandatory(this, "workers", workersByRootElementName);
+        PropertyCheck.mandatory(this, "supportedMimetypes", supportedMimetypes);
+    }
+    
+    /**
+     * Parses the content only as far as its root element and returns the worker that is
+     * registered under that element's name.
+     * 
+     * @param reader        the content reader; its stream is only opened for supported mimetypes
+     * @return              the matching worker, or <tt>null</tt> if the mimetype is not supported
+     *                      or no worker is registered for the root element
+     * @throws ContentIOException   if the root element cannot be extracted from the content
+     */
+    public W getWorker(ContentReader reader)
+    {
+        String mimetype = reader.getMimetype();
+        if (!supportedMimetypes.contains(mimetype))
+        {
+            return null;
+        }
+        String rootName = null;
+        W selected = null;
+        InputStream stream = null;
+        try
+        {
+            stream = reader.getContentInputStream();
+            // startElement() below abandons the parse by throwing on the first element
+            saxParserFactory.newSAXParser().parse(stream, this);
+            // Getting here means that no element was reported: no match possible
+        }
+        catch (RootElementFoundException found)
+        {
+            rootName = found.getElementName();
+            selected = workersByRootElementName.get(rootName);
+        }
+        catch (Throwable e)
+        {
+            throw new ContentIOException("\n" +
+                    "Failed to extract root element from XML document: \n" +
+                    "   Reader:   " + reader + "\n" +
+                    "   Selector: " + this,
+                    e);
+        }
+        finally
+        {
+            if (stream != null)
+            {
+                try { stream.close(); } catch (Throwable ignored) {}
+            }
+        }
+        if (logger.isDebugEnabled())
+        {
+            logger.debug("\n" +
+                    "Chosen content worker for reader: \n" +
+                    "   Reader:       " + reader + "\n" +
+                    "   Root Element: " + rootName + "\n" +
+                    "   Worker:       " + selected);
+        }
+        return selected;
+    }
+
+    /** Invoked by the parser for the first element it meets, i.e. the root: stop right there. */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
+    {
+        throw new RootElementFoundException(qName);
+    }
+
+    /** An exception used only to break out of the XML parsing early; it carries the element name. */
+    private static class RootElementFoundException extends SAXException
+    {
+        private static final long serialVersionUID = 6845880422947198814L;
+        private final String elementName;
+        public RootElementFoundException(String foundElementName)
+        {
+            super(foundElementName);
+            this.elementName = foundElementName;
+        }
+        public String getElementName()
+        {
+            return elementName;
+        }
+    }
+}
--- a/source/java/org/alfresco/repo/content/selector/XPathContentWorkerSelector.java
+++ b/source/java/org/alfresco/repo/content/selector/XPathContentWorkerSelector.java
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2005-2007 Alfresco Software Limited.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+ * As a special exception to the terms and conditions of version 2.0 of 
+ * the GPL, you may redistribute this Program in connection with Free/Libre 
+ * and Open Source Software ("FLOSS") applications as described in Alfresco's 
+ * FLOSS exception.  You should have received a copy of the text describing 
+ * the FLOSS exception, and it is also available here: 
+ * http://www.alfresco.com/legal/licensing"
+ */
+package org.alfresco.repo.content.selector;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+import org.alfresco.error.AlfrescoRuntimeException;
+import org.alfresco.repo.content.ContentWorker;
+import org.alfresco.repo.content.MimetypeMap;
+import org.alfresco.service.cmr.repository.ContentIOException;
+import org.alfresco.service.cmr.repository.ContentReader;
+import org.alfresco.util.PropertyCheck;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.w3c.dom.Document;
+
+/**
+ * A selector that executes a set of XPath statements against the XML document to determine
+ * which content worker to provide.  The XPath rules are simple, i.e. if an XML node is
+ * found by the XPath statement, then it is considered to be a hit and the corresponding
+ * worker is returned.
+ * <p>
+ * Currently, the only namespaces supported are those contained in the XML documents being tested.
+ * 
+ * @since 2.1
+ * @author Derek Hulley
+ */
+public class XPathContentWorkerSelector<W extends ContentWorker> implements ContentWorkerSelector
+{
+    private static Log logger = LogFactory.getLog(XPathContentWorkerSelector.class);
+    
+    // NOTE(review): one builder is shared by every call; confirm getWorker is never run concurrently
+    private DocumentBuilder documentBuilder;
+    private XPathFactory xpathFactory;
+    private Set<String> supportedMimetypes;
+    private Map<String, W> workersByXPath;
+    
+    /** Creates the XML parsing and XPath factories; the XML mimetype is supported by default. */
+    public XPathContentWorkerSelector()
+    {
+        try
+        {
+            documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+            xpathFactory = XPathFactory.newInstance();
+        }
+        catch (Throwable e)
+        {
+            throw new AlfrescoRuntimeException("Failed to initialize XPathContentWorkerSelector", e);
+        }
+        supportedMimetypes = new HashSet<String>();
+        supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
+    }
+
+    @Override
+    public String toString()
+    {
+        return "XPathContentWorkerSelector" + "[ workers=" + workersByXPath + "]";
+    }
+
+    /**
+     * Optionally set the mimetypes supported.  They must be XML formats that the chosen
+     * parser will be able to handle.
+     * 
+     * @param supportedMimetypes        the set of mimetypes.  The default is <b>text/xml</b>.
+     */
+    public void setSupportedMimetypes(Set<String> supportedMimetypes)
+    {
+        this.supportedMimetypes = supportedMimetypes;
+    }
+
+    /**
+     * Set the workers to use.  All the XPath statements provided must be compatible with
+     * a return value of type {@linkplain XPathConstants#NODE NODE}.
+     * 
+     * @param workers            a map of {@linkplain ContentWorker} instances
+     *                           keyed by XPath statements
+     */
+    public void setWorkers(Map<String, W> workers)
+    {
+        this.workersByXPath = workers;
+    }
+
+    /** Checks the configuration: both the workers and the supported mimetypes are mandatory. */
+    public void init()
+    {
+        PropertyCheck.mandatory(this, "workers", workersByXPath);
+        PropertyCheck.mandatory(this, "supportedMimetypes", supportedMimetypes);
+    }
+    
+    /**
+     * Execute the XPath statements against the document, in the iteration order of the workers
+     * map, until one of them finds a node.  Any statements that fail to run will be ignored.
+     * 
+     * @param reader        the content reader; its stream is only opened for supported mimetypes
+     * @return              the worker of the first statement to find a node, or <tt>null</tt> if
+     *                      the mimetype is not supported or no statement finds a node
+     * @throws ContentIOException   if opening, parsing or evaluating the document fails
+     */
+    public W getWorker(ContentReader reader)
+    {
+        if (!supportedMimetypes.contains(reader.getMimetype()))
+        {
+            return null;
+        }
+        W worker = null;
+        InputStream is = null;
+        String xpath = null;
+        try
+        {
+            is = reader.getContentInputStream();
+            Document doc = documentBuilder.parse(is);
+            // Execute the statements, keeping the matched XPath so that it can be reported below
+            Map.Entry<String, W> match = processDocument(doc);
+            xpath = (match == null) ? null : match.getKey();
+            worker = (match == null) ? null : match.getValue();
+        }
+        catch (Throwable e)
+        {
+            throw new ContentIOException("\n" +
+                    "Failed to execute XPaths against XML document: \n" +
+                    "   Reader:   " + reader + "\n" +
+                    "   Selector: " + this,
+                    e);
+        }
+        finally
+        {
+            if (is != null)
+            {
+                try { is.close(); } catch (IOException ignored) {}
+            }
+        }
+        if (logger.isDebugEnabled())
+        {
+            logger.debug("\n" +
+                    "Chosen content worker for reader: \n" +
+                    "   Reader:       " + reader + "\n" +
+                    "   XPath:        " + xpath + "\n" +
+                    "   Worker:    " + worker);
+        }
+        return worker;
+    }
+    
+    /**
+     * Check the given document against the XPath statements provided.
+     * 
+     * @param doc               the XML document
+     * @return                  Returns the entry (XPath statement and worker) of the first
+     *                          statement to find a node, or <tt>null</tt> if none does
+     */
+    private Map.Entry<String, W> processDocument(Document doc)
+    {
+        for (Map.Entry<String, W> entry : workersByXPath.entrySet())
+        {
+            String xpath = entry.getKey();
+            try
+            {
+                Object ret = xpathFactory.newXPath().evaluate(xpath, doc, XPathConstants.NODE);
+                if (ret != null)
+                {
+                    return entry;
+                }
+            }
+            catch (XPathExpressionException e)
+            {
+                // We accept this and move on, but leave a trace of the statement that failed
+                logger.debug("Ignoring XPath statement that failed to execute: " + xpath, e);
+            }
+        }
+        // Nothing found
+        return null;
+    }
+}