mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-24 17:32:48 +00:00
XML metadata extraction with sample.
Added tests into build. This is now ready for testing, comments and suggestions. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@6056 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception. You should have recieved a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing"
|
||||
*/
|
||||
package org.alfresco.repo.content.selector;
|
||||
|
||||
import org.alfresco.repo.content.ContentWorker;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
|
||||
/**
|
||||
* An interface instances that are able to identify content based on the
|
||||
* {@linkplain ContentReader content reader}. This is specifically
|
||||
* aimed at extractors, transformers, injectors and similar classes.
|
||||
* <p>
|
||||
* The notion of supplying some type of worker looks a bit odd here, but
|
||||
* really an instance of this type will act as an optional factory. Also,
|
||||
* in the context of the calling class, the context and the generics will
|
||||
* identify exactly which type is returned by the factory.
|
||||
*
|
||||
* @since 2.1
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public interface ContentWorkerSelector<W extends ContentWorker>
|
||||
{
|
||||
/**
|
||||
* Provides an worker appropriate to the given content, if possible. The reader
|
||||
* should only be used if absolutely required. The caller should always request
|
||||
* {@linkplain ContentReader#getReader() a new reader} or check the
|
||||
* {@linkplain ContentReader#isClosed() reader's state}.
|
||||
*
|
||||
* @param reader the content reader, providing the actual stream metadata
|
||||
* and even the stream, if required.
|
||||
* @return Return a worker that can operate on the content, or <tt>null</tt>
|
||||
* if this identifier doesn't support the content.
|
||||
* @throws ContentIOException
|
||||
* if the search fails
|
||||
*/
|
||||
W getWorker(ContentReader reader);
|
||||
}
|
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception. You should have recieved a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing"
|
||||
*/
|
||||
package org.alfresco.repo.content.selector;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import org.alfresco.repo.content.ContentWorker;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.util.PropertyCheck;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
/**
|
||||
* A selector that looks at the root node of an XML document to determine which worker to provide.
|
||||
* There are many ways to identify XML documents and this is probably the simplest. Alternate
|
||||
* implementations might execute a series of xpath statements or look for specific namespace
|
||||
* declarations in the document. The net result is the same, i.e. given an XML document, a
|
||||
* worker is provided to the caller.
|
||||
*
|
||||
* @since 2.1
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class RootElementNameContentWorkerSelector<W extends ContentWorker>
|
||||
extends DefaultHandler
|
||||
implements ContentWorkerSelector<ContentWorker>
|
||||
{
|
||||
private static Log logger = LogFactory.getLog(RootElementNameContentWorkerSelector.class);
|
||||
|
||||
private SAXParserFactory saxParserFactory;
|
||||
private Set<String> supportedMimetypes;
|
||||
private Map<String, W> workersByRootElementName;
|
||||
|
||||
public RootElementNameContentWorkerSelector()
|
||||
{
|
||||
saxParserFactory = SAXParserFactory.newInstance();
|
||||
supportedMimetypes = new HashSet<String>();
|
||||
supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
|
||||
workersByRootElementName = Collections.emptyMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
StringBuilder sb = new StringBuilder(50);
|
||||
sb.append("RootElementNameContentWorkerSelector")
|
||||
.append("[ workers=").append(workersByRootElementName)
|
||||
.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Optionally set the mimetypes supported. They must be XML formats that the chosen
|
||||
* parser will be able to handle.
|
||||
*
|
||||
* @param supportedMimetypes the list of mimetypes. The default is <b>text/xml</b>.
|
||||
*/
|
||||
public void setSupportedMimetypes(Set<String> supportedMimetypes)
|
||||
{
|
||||
this.supportedMimetypes = supportedMimetypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the workers to choose from.
|
||||
*
|
||||
* @param workers a map of {@linkplain ContentWorker} instances
|
||||
* keyed by root element name
|
||||
*/
|
||||
public void setWorkers(Map<String, W> workers)
|
||||
{
|
||||
this.workersByRootElementName = workers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks the configuration.
|
||||
*/
|
||||
public void init()
|
||||
{
|
||||
PropertyCheck.mandatory(this, "workers", workersByRootElementName);
|
||||
PropertyCheck.mandatory(this, "supportedMimetypes", supportedMimetypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs a match of the root element name to find the correct content worker.
|
||||
*/
|
||||
public W getWorker(ContentReader reader)
|
||||
{
|
||||
if (!supportedMimetypes.contains(reader.getMimetype()))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
W worker = null;
|
||||
InputStream is = null;
|
||||
String rootElementName = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
SAXParser saxParser = saxParserFactory.newSAXParser();
|
||||
saxParser.parse(is, this);
|
||||
// No match possible
|
||||
}
|
||||
catch (RootElementFoundException e)
|
||||
{
|
||||
rootElementName = e.getElementName();
|
||||
worker = workersByRootElementName.get(rootElementName);
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new ContentIOException("\n" +
|
||||
"Failed to extract root element from XML document: \n" +
|
||||
" Reader: " + reader + "\n" +
|
||||
" Selector: " + this,
|
||||
e);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (Throwable e) {}
|
||||
}
|
||||
}
|
||||
// Done
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("\n" +
|
||||
"Chosen content worker for reader: \n" +
|
||||
" Reader: " + reader + "\n" +
|
||||
" Root Element: " + rootElementName + "\n" +
|
||||
" Worker: " + worker);
|
||||
}
|
||||
return worker;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
|
||||
{
|
||||
throw new RootElementFoundException(qName);
|
||||
}
|
||||
|
||||
/**
|
||||
* An exception to break out of the XML parsing early
|
||||
*/
|
||||
private static class RootElementFoundException extends SAXException
|
||||
{
|
||||
private static final long serialVersionUID = 6845880422947198814L;
|
||||
private String elementName;
|
||||
public RootElementFoundException(String elementName)
|
||||
{
|
||||
super(elementName);
|
||||
this.elementName = elementName;
|
||||
}
|
||||
public String getElementName()
|
||||
{
|
||||
return elementName;
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,204 @@
|
||||
/*
|
||||
* Copyright (C) 2005-2007 Alfresco Software Limited.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
|
||||
* As a special exception to the terms and conditions of version 2.0 of
|
||||
* the GPL, you may redistribute this Program in connection with Free/Libre
|
||||
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
||||
* FLOSS exception. You should have recieved a copy of the text describing
|
||||
* the FLOSS exception, and it is also available here:
|
||||
* http://www.alfresco.com/legal/licensing"
|
||||
*/
|
||||
package org.alfresco.repo.content.selector;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import javax.xml.xpath.XPathFactory;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.alfresco.repo.content.ContentWorker;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.util.PropertyCheck;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.w3c.dom.Document;
|
||||
|
||||
/**
|
||||
* A selector that executes a set of XPath statements against the XML document to determine
|
||||
* which content worker to provide. The XPath rules are simple, i.e. if an XML node is
|
||||
* found by the XPath statement, then it is considered to be a hit and the corresponding
|
||||
* worker is returned.
|
||||
* <p>
|
||||
* Currently, the only namespaces supported are those contained in the XML documents being
|
||||
* tested.
|
||||
*
|
||||
* @since 2.1
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class XPathContentWorkerSelector<W extends ContentWorker> implements ContentWorkerSelector
|
||||
{
|
||||
private static Log logger = LogFactory.getLog(XPathContentWorkerSelector.class);
|
||||
|
||||
private DocumentBuilder documentBuilder;
|
||||
private XPathFactory xpathFactory;
|
||||
private Set<String> supportedMimetypes;
|
||||
private Map<String, W> workersByXPath;
|
||||
|
||||
public XPathContentWorkerSelector()
|
||||
{
|
||||
try
|
||||
{
|
||||
documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
|
||||
xpathFactory = XPathFactory.newInstance();
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new AlfrescoRuntimeException("Failed to initialize XPathContentWorkerSelector", e);
|
||||
}
|
||||
supportedMimetypes = new HashSet<String>();
|
||||
supportedMimetypes.add(MimetypeMap.MIMETYPE_XML);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
StringBuilder sb = new StringBuilder(50);
|
||||
sb.append("XPathContentWorkerSelector")
|
||||
.append("[ workers=").append(workersByXPath)
|
||||
.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Optionally set the mimetypes supported. They must be XML formats that the chosen
|
||||
* parser will be able to handle.
|
||||
*
|
||||
* @param supportedMimetypes the list of mimetypes. The default is <b>text/xml</b>.
|
||||
*/
|
||||
public void setSupportedMimetypes(Set<String> supportedMimetypes)
|
||||
{
|
||||
this.supportedMimetypes = supportedMimetypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the workers to use. All the XPath statements provided must be compatible with
|
||||
* a return value of type {@linkplain XPathConstants#NODE NODE}.
|
||||
*
|
||||
* @param workers a map of {@linkplain ContentWorker} instances
|
||||
* keyed by XPath statements
|
||||
*/
|
||||
public void setWorkers(Map<String, W> workers)
|
||||
{
|
||||
this.workersByXPath = workers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks the configuration.
|
||||
*/
|
||||
public void init()
|
||||
{
|
||||
PropertyCheck.mandatory(this, "workers", workersByXPath);
|
||||
PropertyCheck.mandatory(this, "supportedMimetypes", supportedMimetypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute the XPath statements, in order, against the document. Any statements that fail
|
||||
* to run will be ignored.
|
||||
*/
|
||||
public W getWorker(ContentReader reader)
|
||||
{
|
||||
if (!supportedMimetypes.contains(reader.getMimetype()))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
W worker = null;
|
||||
InputStream is = null;
|
||||
String xpath = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
Document doc = documentBuilder.parse(is);
|
||||
// Execute the statements
|
||||
worker = processDocument(doc);
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new ContentIOException("\n" +
|
||||
"Failed to XPaths against XML document: \n" +
|
||||
" Reader: " + reader + "\n" +
|
||||
" Selector: " + this,
|
||||
e);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
// Done
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("\n" +
|
||||
"Chosen content worker for reader: \n" +
|
||||
" Reader: " + reader + "\n" +
|
||||
" XPath: " + xpath + "\n" +
|
||||
" Worker: " + worker);
|
||||
}
|
||||
return worker;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check the given document against the list of XPath statements provided.
|
||||
*
|
||||
* @param document the XML document
|
||||
* @return Returns a content worker that was matched or <tt>null</tt>
|
||||
*/
|
||||
private W processDocument(Document doc)
|
||||
{
|
||||
for (Map.Entry<String, W> entry : workersByXPath.entrySet())
|
||||
{
|
||||
try
|
||||
{
|
||||
String xpath = entry.getKey();
|
||||
W worker = entry.getValue();
|
||||
// Execute the statement
|
||||
Object ret = xpathFactory.newXPath().evaluate(xpath, doc, XPathConstants.NODE);
|
||||
if (ret != null)
|
||||
{
|
||||
// We found one
|
||||
return worker;
|
||||
}
|
||||
}
|
||||
catch (XPathExpressionException e)
|
||||
{
|
||||
// We accept this and move on
|
||||
}
|
||||
}
|
||||
// Nothing found
|
||||
return null;
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user