Derek Hulley 55a6e2f287 XML metadata extraction with sample.
Added tests into build.

This is now ready for testing, comments and suggestions.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@6056 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
2007-06-21 16:09:03 +00:00

309 lines
14 KiB
Java

/*
* Copyright (C) 2005 Jesper Steen Møller
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* As a special exception to the terms and conditions of version 2.0 of
* the GPL, you may redistribute this Program in connection with Free/Libre
* and Open Source Software ("FLOSS") applications as described in Alfresco's
* FLOSS exception. You should have recieved a copy of the text describing
* the FLOSS exception, and it is also available here:
* http://www.alfresco.com/legal/licensing"
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.alfresco.repo.content.ContentWorker;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
/**
* Interface for document property extracters.
* <p>
* Please pardon the incorrect spelling of <i>extractor</i>.
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
public interface MetadataExtracter extends ContentWorker
{
/**
* A enumeration of functional property overwrite policies. These determine whether extracted properties are
* written into the property map or not.
*
* @author Derek Hulley
* @author Jesper Steen Møller
*/
public enum OverwritePolicy
{
/**
* This policy puts the new value if:
* <ul>
* <li>the extracted property is not null</li>
* </ul>
*/
EAGER
{
@Override
public Map<QName, Serializable> applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
{
Map<QName, Serializable> modifiedProperties = new HashMap<QName, Serializable>(7);
for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
{
QName propertyQName = entry.getKey();
Serializable extractedValue = entry.getValue();
// Ignore null extracted value
if (extractedValue == null)
{
continue;
}
targetProperties.put(propertyQName, extractedValue);
modifiedProperties.put(propertyQName, extractedValue);
}
return modifiedProperties;
}
},
/**
* This policy puts the new value if:
* <ul>
* <li>the extracted property is not null</li>
* <li>there is no target key for the property</li>
* <li>the target value is null</li>
* <li>the string representation of the target value is an empty string</li>
* </ul>
*/
PRAGMATIC
{
@Override
public Map<QName, Serializable> applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
{
/*
* Negative and positive checks are mixed in the loop.
*/
Map<QName, Serializable> modifiedProperties = new HashMap<QName, Serializable>(7);
for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
{
QName propertyQName = entry.getKey();
Serializable extractedValue = entry.getValue();
// Ignore null extracted value
if (extractedValue == null)
{
continue;
}
// Handle the shortcut cases where the target value is missing or null
if (!targetProperties.containsKey(propertyQName))
{
// There is nothing currently
targetProperties.put(propertyQName, extractedValue);
modifiedProperties.put(propertyQName, extractedValue);
continue;
}
Serializable originalValue = targetProperties.get(propertyQName);
if (originalValue == null)
{
// The current value is null
targetProperties.put(propertyQName, extractedValue);
modifiedProperties.put(propertyQName, extractedValue);
continue;
}
// Check the string representation
if (originalValue instanceof String)
{
String originalValueStr = (String) originalValue;
if (originalValueStr != null && originalValueStr.length() > 0)
{
// The original value is non-trivial
continue;
}
else
{
// The original string is trivial
targetProperties.put(propertyQName, extractedValue);
modifiedProperties.put(propertyQName, extractedValue);
continue;
}
}
// We have some other object as the original value, so keep it
}
return modifiedProperties;
}
},
/**
* This policy only puts the extracted value if there is no value (null or otherwise) in the properties map.
* It is assumed that the mere presence of a property key is enough to inidicate that the target property
* is as intented.
* This policy puts the new value if:
* <ul>
* <li>the extracted property is not null</li>
* <li>there is no target key for the property</li>
* </ul>
*/
CAUTIOUS
{
@Override
public Map<QName, Serializable> applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
{
Map<QName, Serializable> modifiedProperties = new HashMap<QName, Serializable>(7);
for (Map.Entry<QName, Serializable> entry : extractedProperties.entrySet())
{
QName propertyQName = entry.getKey();
Serializable extractedValue = entry.getValue();
// Ignore null extracted value
if (extractedValue == null)
{
continue;
}
// Is the key present in the target values
if (targetProperties.containsKey(propertyQName))
{
// Cautiously bypass the value as there is one already
continue;
}
targetProperties.put(propertyQName, extractedValue);
modifiedProperties.put(propertyQName, extractedValue);
}
return modifiedProperties;
}
};
/**
* Apply the overwrite policy for the extracted properties.
*
* @return
* Returns a map of all properties that were applied to the target map. If the result is
* an empty map, then the target map remains unchanged.
*/
public Map<QName, Serializable> applyProperties(Map<QName, Serializable> extractedProperties, Map<QName, Serializable> targetProperties)
{
throw new UnsupportedOperationException("Override this method");
}
};
/**
* Get an estimate of the extracter's reliability on a scale from 0.0 to 1.0.
*
* @param mimetype the mimetype to check
* @return Returns a reliability indicator from 0.0 to 1.0
*
* @deprecated This method is replaced by {@link #isSupported(String)}
*/
public double getReliability(String mimetype);
/**
* Determines if the extracter works against the given mimetype.
*
* @param mimetype the document mimetype
* @return Returns <tt>true</tt> if the mimetype is supported, otherwise <tt>false</tt>.
*/
public boolean isSupported(String mimetype);
/**
* Provides an estimate, usually a worst case guess, of how long an
* extraction will take.
* <p>
* This method is used to determine, up front, which of a set of equally
* reliant transformers will be used for a specific extraction.
*
* @return Returns the approximate number of milliseconds per transformation
*
* @deprecated Generally not useful or used. Extraction is normally specifically configured.
*/
public long getExtractionTime();
/**
* Extracts the metadata values from the content provided by the reader and source
* mimetype to the supplied map. The internal mapping and {@link OverwritePolicy overwrite policy}
* between document metadata and system metadata will be used.
* <p>
* The extraction viability can be determined by an up front call to {@link #isSupported(String)}.
* <p>
* The source mimetype <b>must</b> be available on the
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
* of the reader.
*
* @param reader the source of the content
* @param destination the map of properties to populate (essentially a return value)
* @return Returns a map of all properties on the destination map that were
* added or modified. If the return map is empty, then no properties
* were modified.
* @throws ContentIOException if a detectable error occurs
*
* @see #extract(ContentReader, OverwritePolicy, Map, Map)
*/
public Map<QName, Serializable> extract(ContentReader reader, Map<QName, Serializable> destination);
/**
* Extracts the metadata values from the content provided by the reader and source
* mimetype to the supplied map.
* <p>
* The extraction viability can be determined by an up front call to {@link #isSupported(String)}.
* <p>
* The source mimetype <b>must</b> be available on the
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
* of the reader.
*
* @param reader the source of the content
* @param overwritePolicy the policy stipulating how the system properties must be
* overwritten if present
* @param destination the map of properties to populate (essentially a return value)
* @return Returns a map of all properties on the destination map that were
* added or modified. If the return map is empty, then no properties
* were modified.
* @throws ContentIOException if a detectable error occurs
*
* @see #extract(ContentReader, OverwritePolicy, Map, Map)
*/
public Map<QName, Serializable> extract(
ContentReader reader,
OverwritePolicy overwritePolicy,
Map<QName, Serializable> destination);
/**
* Extracts the metadata from the content provided by the reader and source
* mimetype to the supplied map. The mapping from document metadata to system metadata
* is explicitly provided. The {@link OverwritePolicy overwrite policy} is also explictly
* set.
* <p>
* The extraction viability can be determined by an up front call to
* {@link #isSupported(String)}.
* <p>
* The source mimetype <b>must</b> be available on the
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
* of the reader.
*
* @param reader the source of the content
* @param overwritePolicy the policy stipulating how the system properties must be
* overwritten if present
* @param destination the map of properties to populate (essentially a return value)
* @param mapping a mapping of document-specific properties to system properties.
* @return Returns a map of all properties on the destination map that were
* added or modified. If the return map is empty, then no properties
* were modified.
* @throws ContentIOException if a detectable error occurs
*
* @see #extract(ContentReader, Map)
*/
public Map<QName, Serializable> extract(
ContentReader reader,
OverwritePolicy overwritePolicy,
Map<QName, Serializable> destination,
Map<String, Set<QName>> mapping);
}