XML metadata extraction with sample.

Added tests into build.

This is now ready for testing, comments and suggestions.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@6056 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Derek Hulley
2007-06-21 16:09:03 +00:00
parent 757616bc85
commit 55a6e2f287
10 changed files with 789 additions and 53 deletions

View File

@@ -0,0 +1,60 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing
*/
package org.alfresco.repo.content.selector;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
/**
* An interface for instances that are able to identify content based on the
* {@linkplain ContentReader content reader} and supply a worker for it. This is
* specifically aimed at extractors, transformers, injectors and similar classes.
* <p>
* The notion of supplying some type of worker looks a bit odd here, but
* really an instance of this type acts as an optional factory: it either
* supplies a worker or <tt>null</tt>. In the context of the calling class,
* the generic type parameter identifies exactly which type is returned by
* the factory.
*
* @param <W> the type of {@linkplain ContentWorker worker} supplied by the selector
*
* @since 2.1
* @author Derek Hulley
*/
public interface ContentWorkerSelector<W extends ContentWorker>
{
/**
* Provides a worker appropriate to the given content, if possible. Implementations
* should only read from the reader if absolutely required. After calling this
* method, the caller should always request
* {@linkplain ContentReader#getReader() a new reader} or check the
* {@linkplain ContentReader#isClosed() reader's state} before using the content
* again, as an implementation may have opened the reader's stream.
*
* @param reader the content reader, providing the actual stream metadata
* and even the stream, if required.
* @return Return a worker that can operate on the content, or <tt>null</tt>
* if this identifier doesn't support the content.
* @throws ContentIOException
* if the search fails
*/
W getWorker(ContentReader reader);
}

View File

@@ -0,0 +1,190 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing
*/
package org.alfresco.repo.content.selector;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.util.PropertyCheck;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A selector that looks at the root node of an XML document to determine which worker to provide.
 * There are many ways to identify XML documents and this is probably the simplest. Alternate
 * implementations might execute a series of xpath statements or look for specific namespace
 * declarations in the document. The net result is the same, i.e. given an XML document, a
 * worker is provided to the caller.
 * <p>
 * The selector is its own SAX handler: the first <tt>startElement</tt> event aborts the parse
 * by throwing a private exception that carries the element name, so only the document prolog
 * and the root start tag are ever read.
 * <p>
 * The parser factory is configured, on a best-effort basis, not to load external entities or
 * external DTDs, so that identifying a document never fetches resources named by that document.
 * <p>
 * NOTE(review): the class implements <tt>ContentWorkerSelector&lt;ContentWorker&gt;</tt> rather
 * than <tt>ContentWorkerSelector&lt;W&gt;</tt>. This has been left untouched to keep the
 * published type signature stable - confirm whether callers rely on it.
 * <br>
 * NOTE(review): a single <tt>SAXParserFactory</tt> is shared by all calls to
 * {@link #getWorker(ContentReader)}; confirm that instances are not used concurrently or that
 * the configured factory implementation tolerates it.
 *
 * @param <W> the type of worker held in the map of workers
 *
 * @since 2.1
 * @author Derek Hulley
 */
public class RootElementNameContentWorkerSelector<W extends ContentWorker>
        extends DefaultHandler
        implements ContentWorkerSelector<ContentWorker>
{
    private static final Log logger = LogFactory.getLog(RootElementNameContentWorkerSelector.class);

    /** Parser features that are switched off so that parsing never pulls in external resources. */
    private static final String[] EXTERNAL_RESOURCE_FEATURES = new String[]
    {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    private SAXParserFactory saxParserFactory;
    private Set<String> supportedMimetypes;
    private Map<String, W> workersByRootElementName;

    /**
     * Creates a selector that supports {@link MimetypeMap#MIMETYPE_XML} and has no workers.
     */
    public RootElementNameContentWorkerSelector()
    {
        saxParserFactory = SAXParserFactory.newInstance();
        disableExternalResources(saxParserFactory);
        supportedMimetypes = new HashSet<String>();
        supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
        workersByRootElementName = Collections.emptyMap();
    }

    /**
     * Switches off the loading of external entities and external DTDs on the given factory.
     * A feature that the parser implementation rejects is logged and skipped, so that an
     * unusual parser does not prevent the selector from being constructed.
     *
     * @param factory the factory to configure
     */
    private static void disableExternalResources(SAXParserFactory factory)
    {
        for (String feature : EXTERNAL_RESOURCE_FEATURES)
        {
            try
            {
                factory.setFeature(feature, false);
            }
            catch (Exception e)
            {
                // Best effort only: not every parser implementation recognises every feature
                logger.warn("XML parser factory does not support turning off feature: " + feature, e);
            }
        }
    }

    @Override
    public String toString()
    {
        StringBuilder sb = new StringBuilder(50);
        sb.append("RootElementNameContentWorkerSelector")
          .append("[ workers=").append(workersByRootElementName)
          .append("]");
        return sb.toString();
    }

    /**
     * Optionally set the mimetypes supported. They must be XML formats that the chosen
     * parser will be able to handle.
     *
     * @param supportedMimetypes the list of mimetypes. The default is <b>text/xml</b>.
     */
    public void setSupportedMimetypes(Set<String> supportedMimetypes)
    {
        this.supportedMimetypes = supportedMimetypes;
    }

    /**
     * Set the workers to choose from.
     *
     * @param workers a map of {@linkplain ContentWorker} instances
     *                keyed by root element name
     */
    public void setWorkers(Map<String, W> workers)
    {
        this.workersByRootElementName = workers;
    }

    /**
     * Checks the configuration: both the workers and the supported mimetypes must be set.
     */
    public void init()
    {
        PropertyCheck.mandatory(this, "workers", workersByRootElementName);
        PropertyCheck.mandatory(this, "supportedMimetypes", supportedMimetypes);
    }

    /**
     * Performs a match of the root element name to find the correct content worker.
     * <p>
     * The reader's content stream is opened, parsed up to the root element and then closed.
     *
     * @param reader the reader giving access to the mimetype and the XML content
     * @return Returns the worker registered against the document's root element name, or
     *         <tt>null</tt> if the mimetype is not supported, the document yields no
     *         element or no worker is registered for the root element name
     * @throws ContentIOException if the document could not be read or parsed up to its
     *         root element
     */
    public W getWorker(ContentReader reader)
    {
        if (!supportedMimetypes.contains(reader.getMimetype()))
        {
            return null;
        }
        W worker = null;
        InputStream is = null;
        String rootElementName = null;
        try
        {
            is = reader.getContentInputStream();
            SAXParser saxParser = saxParserFactory.newSAXParser();
            saxParser.parse(is, this);
            // The parse completed without reporting an element: no match possible
        }
        catch (RootElementFoundException e)
        {
            // Expected path: startElement aborted the parse with the root element name
            rootElementName = e.getElementName();
            worker = workersByRootElementName.get(rootElementName);
        }
        catch (Throwable e)
        {
            throw new ContentIOException("\n" +
                    "Failed to extract root element from XML document: \n" +
                    "   Reader:   " + reader + "\n" +
                    "   Selector: " + this,
                    e);
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (Throwable e)
                {
                    // Closing is best effort: the outcome of the selection is already known
                    if (logger.isDebugEnabled())
                    {
                        logger.debug("Failed to close content stream for reader: " + reader, e);
                    }
                }
            }
        }
        // Done
        if (logger.isDebugEnabled())
        {
            logger.debug("\n" +
                    "Chosen content worker for reader: \n" +
                    "   Reader:       " + reader + "\n" +
                    "   Root Element: " + rootElementName + "\n" +
                    "   Worker:       " + worker);
        }
        return worker;
    }

    /**
     * Aborts the parse at the first element reported by the parser, i.e. the root element.
     *
     * @throws SAXException always: a {@link RootElementFoundException} carrying <tt>qName</tt>
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
    {
        throw new RootElementFoundException(qName);
    }

    /**
     * An exception to break out of the XML parsing early
     */
    private static class RootElementFoundException extends SAXException
    {
        private static final long serialVersionUID = 6845880422947198814L;

        private final String elementName;

        public RootElementFoundException(String elementName)
        {
            super(elementName);
            this.elementName = elementName;
        }

        public String getElementName()
        {
            return elementName;
        }
    }
}

View File

@@ -0,0 +1,204 @@
/*
* Copyright (C) 2005-2007 Alfresco Software Limited.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have received a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing
*/
package org.alfresco.repo.content.selector;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.util.PropertyCheck;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
/**
 * A selector that executes a set of XPath statements against the XML document to determine
 * which content worker to provide. The XPath rules are simple, i.e. if an XML node is
 * found by the XPath statement, then it is considered to be a hit and the corresponding
 * worker is returned.
 * <p>
 * Currently, the only namespaces supported are those contained in the XML documents being
 * tested.
 * <p>
 * The document builder is configured, on a best-effort basis, not to load external entities
 * or external DTDs, so that identifying a document never fetches resources named by that
 * document.
 * <p>
 * NOTE(review): the class implements the raw <tt>ContentWorkerSelector</tt> type. This has
 * been left untouched to keep the published type signature stable - confirm whether it can
 * be parameterized.
 * <br>
 * NOTE(review): a single <tt>DocumentBuilder</tt> is shared by all calls to
 * {@link #getWorker(ContentReader)}; confirm that instances are not used concurrently.
 *
 * @param <W> the type of worker held in the map of workers
 *
 * @since 2.1
 * @author Derek Hulley
 */
public class XPathContentWorkerSelector<W extends ContentWorker> implements ContentWorkerSelector
{
    private static final Log logger = LogFactory.getLog(XPathContentWorkerSelector.class);

    /** Parser features that are switched off so that parsing never pulls in external resources. */
    private static final String[] EXTERNAL_RESOURCE_FEATURES = new String[]
    {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    private DocumentBuilder documentBuilder;
    private XPathFactory xpathFactory;
    private Set<String> supportedMimetypes;
    private Map<String, W> workersByXPath;

    /**
     * Creates a selector that supports {@link MimetypeMap#MIMETYPE_XML}. The workers must
     * still be {@linkplain #setWorkers(Map) set}.
     *
     * @throws AlfrescoRuntimeException if the XML or XPath infrastructure cannot be created
     */
    public XPathContentWorkerSelector()
    {
        try
        {
            DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
            disableExternalResources(documentBuilderFactory);
            documentBuilder = documentBuilderFactory.newDocumentBuilder();
            xpathFactory = XPathFactory.newInstance();
        }
        catch (Throwable e)
        {
            throw new AlfrescoRuntimeException("Failed to initialize XPathContentWorkerSelector", e);
        }
        supportedMimetypes = new HashSet<String>();
        supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
    }

    /**
     * Switches off the loading of external entities and external DTDs on the given factory.
     * A feature that the parser implementation rejects is logged and skipped, so that an
     * unusual parser does not prevent the selector from being constructed.
     *
     * @param factory the factory to configure
     */
    private static void disableExternalResources(DocumentBuilderFactory factory)
    {
        for (String feature : EXTERNAL_RESOURCE_FEATURES)
        {
            try
            {
                factory.setFeature(feature, false);
            }
            catch (Exception e)
            {
                // Best effort only: not every parser implementation recognises every feature
                logger.warn("XML parser factory does not support turning off feature: " + feature, e);
            }
        }
    }

    @Override
    public String toString()
    {
        StringBuilder sb = new StringBuilder(50);
        sb.append("XPathContentWorkerSelector")
          .append("[ workers=").append(workersByXPath)
          .append("]");
        return sb.toString();
    }

    /**
     * Optionally set the mimetypes supported. They must be XML formats that the chosen
     * parser will be able to handle.
     *
     * @param supportedMimetypes the list of mimetypes. The default is <b>text/xml</b>.
     */
    public void setSupportedMimetypes(Set<String> supportedMimetypes)
    {
        this.supportedMimetypes = supportedMimetypes;
    }

    /**
     * Set the workers to use. All the XPath statements provided must be compatible with
     * a return value of type {@linkplain XPathConstants#NODE NODE}.
     * <p>
     * The statements are tried in the iteration order of the supplied map, so a map with a
     * predictable iteration order should be supplied if the order matters.
     *
     * @param workers a map of {@linkplain ContentWorker} instances
     *                keyed by XPath statements
     */
    public void setWorkers(Map<String, W> workers)
    {
        this.workersByXPath = workers;
    }

    /**
     * Checks the configuration: both the workers and the supported mimetypes must be set.
     */
    public void init()
    {
        PropertyCheck.mandatory(this, "workers", workersByXPath);
        PropertyCheck.mandatory(this, "supportedMimetypes", supportedMimetypes);
    }

    /**
     * Execute the XPath statements, in order, against the document. Any statements that fail
     * to run will be ignored.
     * <p>
     * The reader's content stream is opened, parsed completely into a DOM and then closed.
     *
     * @param reader the reader giving access to the mimetype and the XML content
     * @return Returns the worker registered against the first XPath statement that selects
     *         a node, or <tt>null</tt> if the mimetype is not supported or no statement
     *         selects a node
     * @throws ContentIOException if the document could not be read, parsed or checked
     */
    public W getWorker(ContentReader reader)
    {
        if (!supportedMimetypes.contains(reader.getMimetype()))
        {
            return null;
        }
        W worker = null;
        InputStream is = null;
        String xpath = null;
        try
        {
            is = reader.getContentInputStream();
            Document doc = documentBuilder.parse(is);
            // Execute the statements and remember which one matched for the debug output
            Map.Entry<String, W> match = findFirstMatch(doc);
            if (match != null)
            {
                xpath = match.getKey();
                worker = match.getValue();
            }
        }
        catch (Throwable e)
        {
            throw new ContentIOException("\n" +
                    "Failed to execute XPaths against XML document: \n" +
                    "   Reader:   " + reader + "\n" +
                    "   Selector: " + this,
                    e);
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (IOException e)
                {
                    // Closing is best effort: the outcome of the selection is already known
                    if (logger.isDebugEnabled())
                    {
                        logger.debug("Failed to close content stream for reader: " + reader, e);
                    }
                }
            }
        }
        // Done
        if (logger.isDebugEnabled())
        {
            logger.debug("\n" +
                    "Chosen content worker for reader: \n" +
                    "   Reader:       " + reader + "\n" +
                    "   XPath:        " + xpath + "\n" +
                    "   Worker:       " + worker);
        }
        return worker;
    }

    /**
     * Check the given document against the list of XPath statements provided and find the
     * first statement that selects a node.
     *
     * @param doc the XML document
     * @return Returns the matching XPath statement and its content worker, or <tt>null</tt>
     *         if no statement selected a node
     */
    private Map.Entry<String, W> findFirstMatch(Document doc)
    {
        // One evaluator serves every statement executed against this document
        javax.xml.xpath.XPath evaluator = xpathFactory.newXPath();
        for (Map.Entry<String, W> entry : workersByXPath.entrySet())
        {
            String xpath = entry.getKey();
            try
            {
                // Execute the statement
                Object ret = evaluator.evaluate(xpath, doc, XPathConstants.NODE);
                if (ret != null)
                {
                    // We found one
                    return entry;
                }
            }
            catch (XPathExpressionException e)
            {
                // We accept this and move on, but leave a trace for whoever configured it
                if (logger.isDebugEnabled())
                {
                    logger.debug("Ignoring XPath statement that failed to execute: " + xpath, e);
                }
            }
        }
        // Nothing found
        return null;
    }
}