Big honkin' merge from head. Sheesh!

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/BRANCHES/WCM-DEV2/root@3617 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Britt Park
2006-08-27 01:01:30 +00:00
parent e2c66899cc
commit 8031cc6574
322 changed files with 20776 additions and 6550 deletions

View File

@@ -1,220 +1,220 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Common base for metadata extracters. It holds the supported mimetypes together with the
 * reliability and extraction time handed to the constructor, can register itself with a
 * {@link MetadataExtracterRegistry}, and wraps the subclass extraction logic with a mimetype
 * check, exception translation and logging.
 *
 * @author Jesper Steen Møller
 */
public abstract class AbstractMetadataExtracter implements MetadataExtracter
{
    protected static Log logger = LogFactory.getLog(AbstractMetadataExtracter.class);

    private MimetypeService mimetypeService;
    private MetadataExtracterRegistry registry;
    private Set<String> supportedMimetypes;
    private double reliability;
    private long extractionTime;

    /**
     * Creates an extracter that supports exactly one mimetype.
     *
     * @param supportedMimetype the only mimetype for which a reliability is reported
     * @param reliability the value returned by {@link #getReliability(String)} for that mimetype
     * @param extractionTime the value returned by {@link #getExtractionTime()}
     */
    protected AbstractMetadataExtracter(String supportedMimetype, double reliability, long extractionTime)
    {
        this(Collections.singleton(supportedMimetype), reliability, extractionTime);
    }

    /**
     * Creates an extracter that supports a set of mimetypes. The set is stored as given,
     * without being copied.
     *
     * @param supportedMimetypes the mimetypes for which a reliability is reported
     * @param reliability the value returned by {@link #getReliability(String)} for those mimetypes
     * @param extractionTime the value returned by {@link #getExtractionTime()}
     */
    protected AbstractMetadataExtracter(Set<String> supportedMimetypes, double reliability, long extractionTime)
    {
        this.supportedMimetypes = supportedMimetypes;
        this.reliability = reliability;
        this.extractionTime = extractionTime;
    }

    /**
     * Sets the registry used by {@link #register()}.
     *
     * @param registry a metadata extracter registry
     */
    public void setRegistry(MetadataExtracterRegistry registry)
    {
        this.registry = registry;
    }

    /**
     * Sets the mimetype service helper. This is not always required.
     *
     * @param mimetypeService the mimetype service to make available to subclasses
     */
    public void setMimetypeService(MimetypeService mimetypeService)
    {
        this.mimetypeService = mimetypeService;
    }

    /**
     * @return Returns the mimetype helper, or <code>null</code> if none has been set
     */
    protected MimetypeService getMimetypeService()
    {
        return mimetypeService;
    }

    /**
     * Registers this extracter with the configured registry. If no registry has been set,
     * a warning is logged and nothing else happens.
     *
     * @see #setRegistry(MetadataExtracterRegistry)
     */
    public void register()
    {
        if (registry != null)
        {
            registry.register(this);
        }
        else
        {
            logger.warn("Property 'registry' has not been set.  Ignoring auto-registration: \n" +
                    " extracter: " + this);
        }
    }

    /**
     * Default reliability lookup: the reliability given to the constructor when the mimetype
     * is one of the supported mimetypes, otherwise <code>0.0</code>.
     *
     * @param mimetype the mimetype to check
     * @return the configured reliability, or <code>0.0</code> for an unsupported mimetype
     */
    public double getReliability(String mimetype)
    {
        return supportedMimetypes.contains(mimetype) ? reliability : 0.0;
    }

    /**
     * @return Returns the extraction time given to the constructor
     */
    public long getExtractionTime()
    {
        return extractionTime;
    }

    /**
     * Verifies that this extracter reports a positive reliability for the reader's mimetype.
     *
     * @param reader the reader to check
     * @throws AlfrescoRuntimeException if the mimetype is not supported
     */
    protected void checkReliability(ContentReader reader)
    {
        final double score = getReliability(reader.getMimetype());
        if (score <= 0.0)
        {
            throw new AlfrescoRuntimeException(
                    "Metadata extracter does not support mimetype: \n" +
                    " reader: " + reader + "\n" +
                    " supported: " + supportedMimetypes + "\n" +
                    " extracter: " + this);
        }
    }

    /**
     * Template method: checks the mimetype, delegates to
     * {@link #extractInternal(ContentReader, Map)} and translates any failure into a
     * {@link ContentIOException}. Whether or not the extraction succeeded, an error is logged
     * if the reader has been left open.
     *
     * @param reader the source of the content
     * @param destination the property map to fill
     * @throws ContentIOException if the subclass extraction throws anything
     */
    public final void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
    {
        // the mimetype check happens outside the try, so its exception is not wrapped
        checkReliability(reader);
        try
        {
            extractInternal(reader, destination);
        }
        catch (Throwable e)
        {
            throw new ContentIOException("Metadata extraction failed: \n" +
                    " reader: " + reader,
                    e);
        }
        finally
        {
            reportIfLeftOpen(reader);
        }
        // only reached when the extraction did not throw
        reportCompletion(reader);
    }

    /**
     * Logs an error when the extracter did not close the reader.
     */
    private void reportIfLeftOpen(ContentReader reader)
    {
        if (reader.isClosed())
        {
            return;
        }
        logger.error("Content reader not closed by metadata extracter: \n" +
                " reader: " + reader + "\n" +
                " extracter: " + this);
    }

    /**
     * Logs the successful completion of an extraction at debug level.
     */
    private void reportCompletion(ContentReader reader)
    {
        if (logger.isDebugEnabled())
        {
            logger.debug("Completed metadata extraction: \n" +
                    " reader: " + reader + "\n" +
                    " extracter: " + this);
        }
    }

    /**
     * Override to provide the necessary extraction logic. Implementations must ensure that the
     * reader is closed before the method exits.
     *
     * @param reader the source of the content
     * @param destination the property map to fill
     * @throws Throwable an exception
     */
    protected abstract void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable;

    /**
     * Adds a value to the map unless it is <code>null</code> or a string that is empty after
     * trimming. Strings are stored trimmed, other serializable values are stored as they are,
     * and anything else is stored as its <code>toString()</code> form.
     *
     * @param prop Alfresco's <code>ContentModel.PROP_</code> to set.
     * @param value Value to set it to
     * @param destination Map into which to set it
     * @return true, if set, false otherwise
     */
    protected boolean trimPut(QName prop, Object value, Map<QName, Serializable> destination)
    {
        if (value == null)
        {
            return false;
        }
        final Serializable toStore;
        if (value instanceof String)
        {
            // the String test must come first, because a String is also Serializable
            String trimmed = ((String) value).trim();
            if (trimmed.length() == 0)
            {
                return false;
            }
            toStore = trimmed;
        }
        else if (value instanceof Serializable)
        {
            toStore = (Serializable) value;
        }
        else
        {
            toStore = value.toString();
        }
        destination.put(prop, toStore);
        return true;
    }
}
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.MimetypeService;
import org.alfresco.service.namespace.QName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Common base for metadata extracters. It holds the supported mimetypes together with the
 * reliability and extraction time handed to the constructor, can register itself with a
 * {@link MetadataExtracterRegistry}, and wraps the subclass extraction logic with a mimetype
 * check, exception translation and logging.
 *
 * @author Jesper Steen Møller
 */
public abstract class AbstractMetadataExtracter implements MetadataExtracter
{
    protected static Log logger = LogFactory.getLog(AbstractMetadataExtracter.class);

    private MimetypeService mimetypeService;
    private MetadataExtracterRegistry registry;
    private Set<String> supportedMimetypes;
    private double reliability;
    private long extractionTime;

    /**
     * Creates an extracter that supports exactly one mimetype.
     *
     * @param supportedMimetype the only mimetype for which a reliability is reported
     * @param reliability the value returned by {@link #getReliability(String)} for that mimetype
     * @param extractionTime the value returned by {@link #getExtractionTime()}
     */
    protected AbstractMetadataExtracter(String supportedMimetype, double reliability, long extractionTime)
    {
        this(Collections.singleton(supportedMimetype), reliability, extractionTime);
    }

    /**
     * Creates an extracter that supports a set of mimetypes. The set is stored as given,
     * without being copied.
     *
     * @param supportedMimetypes the mimetypes for which a reliability is reported
     * @param reliability the value returned by {@link #getReliability(String)} for those mimetypes
     * @param extractionTime the value returned by {@link #getExtractionTime()}
     */
    protected AbstractMetadataExtracter(Set<String> supportedMimetypes, double reliability, long extractionTime)
    {
        this.supportedMimetypes = supportedMimetypes;
        this.reliability = reliability;
        this.extractionTime = extractionTime;
    }

    /**
     * Sets the registry used by {@link #register()}.
     *
     * @param registry a metadata extracter registry
     */
    public void setRegistry(MetadataExtracterRegistry registry)
    {
        this.registry = registry;
    }

    /**
     * Sets the mimetype service helper. This is not always required.
     *
     * @param mimetypeService the mimetype service to make available to subclasses
     */
    public void setMimetypeService(MimetypeService mimetypeService)
    {
        this.mimetypeService = mimetypeService;
    }

    /**
     * @return Returns the mimetype helper, or <code>null</code> if none has been set
     */
    protected MimetypeService getMimetypeService()
    {
        return mimetypeService;
    }

    /**
     * Registers this extracter with the configured registry. If no registry has been set,
     * a warning is logged and nothing else happens.
     *
     * @see #setRegistry(MetadataExtracterRegistry)
     */
    public void register()
    {
        if (registry != null)
        {
            registry.register(this);
        }
        else
        {
            logger.warn("Property 'registry' has not been set.  Ignoring auto-registration: \n" +
                    " extracter: " + this);
        }
    }

    /**
     * Default reliability lookup: the reliability given to the constructor when the mimetype
     * is one of the supported mimetypes, otherwise <code>0.0</code>.
     *
     * @param mimetype the mimetype to check
     * @return the configured reliability, or <code>0.0</code> for an unsupported mimetype
     */
    public double getReliability(String mimetype)
    {
        return supportedMimetypes.contains(mimetype) ? reliability : 0.0;
    }

    /**
     * @return Returns the extraction time given to the constructor
     */
    public long getExtractionTime()
    {
        return extractionTime;
    }

    /**
     * Verifies that this extracter reports a positive reliability for the reader's mimetype.
     *
     * @param reader the reader to check
     * @throws AlfrescoRuntimeException if the mimetype is not supported
     */
    protected void checkReliability(ContentReader reader)
    {
        final double score = getReliability(reader.getMimetype());
        if (score <= 0.0)
        {
            throw new AlfrescoRuntimeException(
                    "Metadata extracter does not support mimetype: \n" +
                    " reader: " + reader + "\n" +
                    " supported: " + supportedMimetypes + "\n" +
                    " extracter: " + this);
        }
    }

    /**
     * Template method: checks the mimetype, delegates to
     * {@link #extractInternal(ContentReader, Map)} and translates any failure into a
     * {@link ContentIOException}. Whether or not the extraction succeeded, an error is logged
     * if the reader has been left open.
     *
     * @param reader the source of the content
     * @param destination the property map to fill
     * @throws ContentIOException if the subclass extraction throws anything
     */
    public final void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
    {
        // the mimetype check happens outside the try, so its exception is not wrapped
        checkReliability(reader);
        try
        {
            extractInternal(reader, destination);
        }
        catch (Throwable e)
        {
            throw new ContentIOException("Metadata extraction failed: \n" +
                    " reader: " + reader,
                    e);
        }
        finally
        {
            reportIfLeftOpen(reader);
        }
        // only reached when the extraction did not throw
        reportCompletion(reader);
    }

    /**
     * Logs an error when the extracter did not close the reader.
     */
    private void reportIfLeftOpen(ContentReader reader)
    {
        if (reader.isClosed())
        {
            return;
        }
        logger.error("Content reader not closed by metadata extracter: \n" +
                " reader: " + reader + "\n" +
                " extracter: " + this);
    }

    /**
     * Logs the successful completion of an extraction at debug level.
     */
    private void reportCompletion(ContentReader reader)
    {
        if (logger.isDebugEnabled())
        {
            logger.debug("Completed metadata extraction: \n" +
                    " reader: " + reader + "\n" +
                    " extracter: " + this);
        }
    }

    /**
     * Override to provide the necessary extraction logic. Implementations must ensure that the
     * reader is closed before the method exits.
     *
     * @param reader the source of the content
     * @param destination the property map to fill
     * @throws Throwable an exception
     */
    protected abstract void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable;

    /**
     * Adds a value to the map unless it is <code>null</code> or a string that is empty after
     * trimming. Strings are stored trimmed, other serializable values are stored as they are,
     * and anything else is stored as its <code>toString()</code> form.
     *
     * @param prop Alfresco's <code>ContentModel.PROP_</code> to set.
     * @param value Value to set it to
     * @param destination Map into which to set it
     * @return true, if set, false otherwise
     */
    protected boolean trimPut(QName prop, Object value, Map<QName, Serializable> destination)
    {
        if (value == null)
        {
            return false;
        }
        final Serializable toStore;
        if (value instanceof String)
        {
            // the String test must come first, because a String is also Serializable
            String trimmed = ((String) value).trim();
            if (trimmed.length() == 0)
            {
                return false;
            }
            toStore = trimmed;
        }
        else if (value instanceof Serializable)
        {
            toStore = (Serializable) value;
        }
        else
        {
            toStore = value.toString();
        }
        destination.put(prop, toStore);
        return true;
    }
}

View File

@@ -1,116 +1,116 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.alfresco.util.ApplicationContextHelper;
import org.alfresco.util.TempFileProvider;
import org.springframework.context.ApplicationContext;
/**
 * Base class for metadata extracter tests. It looks up the mimetype map from the application
 * context and provides helpers that run an extracter against the "quick" test file of a
 * mimetype and check the extracted title and description.
 *
 * @see org.alfresco.repo.content.metadata.MetadataExtracter
 * @see org.alfresco.repo.content.metadata.AbstractMetadataExtracter
 *
 * @author Jesper Steen Møller
 */
public abstract class AbstractMetadataExtracterTest extends TestCase
{
    private static ApplicationContext ctx = ApplicationContextHelper.getApplicationContext();

    protected static final String QUICK_TITLE = "The quick brown fox jumps over the lazy dog";
    protected static final String QUICK_DESCRIPTION = "Gym class featuring a brown fox and lazy dog";
    protected static final String QUICK_CREATOR = "Nevin Nollop";

    protected MimetypeMap mimetypeMap;

    /**
     * @return Returns the extracter under test
     */
    protected abstract MetadataExtracter getExtracter();

    /**
     * Fetches the mimetype map and asks the temp file cleaner to remove files, passing the
     * current time, before each test.
     */
    @Override
    public void setUp() throws Exception
    {
        mimetypeMap = (MimetypeMap) ctx.getBean("mimetypeService");
        // perform a little cleaning up
        TempFileProvider.TempFileCleanerJob.removeFiles(System.currentTimeMillis());
    }

    /**
     * Checks that the mimetype map and the quick test resources are present.
     */
    public void testSetUp() throws Exception
    {
        assertNotNull("MimetypeMap not present", mimetypeMap);
        // the plain text quick file stands in for all the quick.* resources
        assertNotNull(
                "quick.* files should be available from Tests",
                AbstractContentTransformerTest.loadQuickTestFile("txt"));
    }

    /**
     * Extracts the metadata of the quick file for the mimetype and checks the common properties.
     *
     * @param mimetype the mimetype whose quick file is to be tested
     */
    protected void testExtractFromMimetype(String mimetype) throws Exception
    {
        testCommonMetadata(mimetype, extractFromMimetype(mimetype));
    }

    /**
     * Runs the extracter under test against the quick file for the mimetype.
     *
     * @param mimetype the mimetype whose quick file is to be read
     * @return the properties filled in by the extracter
     * @throws FileNotFoundException if there is no quick file for the mimetype's extension
     */
    protected Map<QName, Serializable> extractFromMimetype(String mimetype) throws Exception
    {
        // the quick files are named after the extension mapped to the mimetype
        String extension = mimetypeMap.getExtension(mimetype);
        File quickFile = AbstractContentTransformerTest.loadQuickTestFile(extension);
        if (quickFile == null)
        {
            throw new FileNotFoundException("No quick." + extension + " file found for test");
        }

        // read the file through a content reader carrying the mimetype under test
        ContentReader contentReader = new FileContentReader(quickFile);
        contentReader.setMimetype(mimetype);

        Map<QName, Serializable> extracted = new HashMap<QName, Serializable>();
        getExtracter().extract(contentReader, extracted);
        return extracted;
    }

    /**
     * Asserts that the title and description match the values expected of the quick files.
     *
     * @param mimetype the mimetype that was extracted, used in the failure messages
     * @param properties the extracted properties
     */
    protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties)
    {
        assertPropertyEquals(ContentModel.PROP_TITLE, QUICK_TITLE, mimetype, properties);
        assertPropertyEquals(ContentModel.PROP_DESCRIPTION, QUICK_DESCRIPTION, mimetype, properties);
    }

    /**
     * Asserts that a single extracted property carries the expected value.
     */
    private void assertPropertyEquals(
            QName property, String expected, String mimetype, Map<QName, Serializable> properties)
    {
        assertEquals(
                "Property " + property + " not found for mimetype " + mimetype,
                expected, properties.get(property));
    }
}
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.content.filestore.FileContentReader;
import org.alfresco.repo.content.transform.AbstractContentTransformerTest;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.alfresco.util.ApplicationContextHelper;
import org.alfresco.util.TempFileProvider;
import org.springframework.context.ApplicationContext;
/**
 * Base class for metadata extracter tests. It looks up the mimetype map from the application
 * context and provides helpers that run an extracter against the "quick" test file of a
 * mimetype and check the extracted title and description.
 *
 * @see org.alfresco.repo.content.metadata.MetadataExtracter
 * @see org.alfresco.repo.content.metadata.AbstractMetadataExtracter
 *
 * @author Jesper Steen Møller
 */
public abstract class AbstractMetadataExtracterTest extends TestCase
{
    private static ApplicationContext ctx = ApplicationContextHelper.getApplicationContext();

    protected static final String QUICK_TITLE = "The quick brown fox jumps over the lazy dog";
    protected static final String QUICK_DESCRIPTION = "Gym class featuring a brown fox and lazy dog";
    protected static final String QUICK_CREATOR = "Nevin Nollop";

    protected MimetypeMap mimetypeMap;

    /**
     * @return Returns the extracter under test
     */
    protected abstract MetadataExtracter getExtracter();

    /**
     * Fetches the mimetype map and asks the temp file cleaner to remove files, passing the
     * current time, before each test.
     */
    @Override
    public void setUp() throws Exception
    {
        mimetypeMap = (MimetypeMap) ctx.getBean("mimetypeService");
        // perform a little cleaning up
        TempFileProvider.TempFileCleanerJob.removeFiles(System.currentTimeMillis());
    }

    /**
     * Checks that the mimetype map and the quick test resources are present.
     */
    public void testSetUp() throws Exception
    {
        assertNotNull("MimetypeMap not present", mimetypeMap);
        // the plain text quick file stands in for all the quick.* resources
        assertNotNull(
                "quick.* files should be available from Tests",
                AbstractContentTransformerTest.loadQuickTestFile("txt"));
    }

    /**
     * Extracts the metadata of the quick file for the mimetype and checks the common properties.
     *
     * @param mimetype the mimetype whose quick file is to be tested
     */
    protected void testExtractFromMimetype(String mimetype) throws Exception
    {
        testCommonMetadata(mimetype, extractFromMimetype(mimetype));
    }

    /**
     * Runs the extracter under test against the quick file for the mimetype.
     *
     * @param mimetype the mimetype whose quick file is to be read
     * @return the properties filled in by the extracter
     * @throws FileNotFoundException if there is no quick file for the mimetype's extension
     */
    protected Map<QName, Serializable> extractFromMimetype(String mimetype) throws Exception
    {
        // the quick files are named after the extension mapped to the mimetype
        String extension = mimetypeMap.getExtension(mimetype);
        File quickFile = AbstractContentTransformerTest.loadQuickTestFile(extension);
        if (quickFile == null)
        {
            throw new FileNotFoundException("No quick." + extension + " file found for test");
        }

        // read the file through a content reader carrying the mimetype under test
        ContentReader contentReader = new FileContentReader(quickFile);
        contentReader.setMimetype(mimetype);

        Map<QName, Serializable> extracted = new HashMap<QName, Serializable>();
        getExtracter().extract(contentReader, extracted);
        return extracted;
    }

    /**
     * Asserts that the title and description match the values expected of the quick files.
     *
     * @param mimetype the mimetype that was extracted, used in the failure messages
     * @param properties the extracted properties
     */
    protected void testCommonMetadata(String mimetype, Map<QName, Serializable> properties)
    {
        assertPropertyEquals(ContentModel.PROP_TITLE, QUICK_TITLE, mimetype, properties);
        assertPropertyEquals(ContentModel.PROP_DESCRIPTION, QUICK_DESCRIPTION, mimetype, properties);
    }

    /**
     * Asserts that a single extracted property carries the expected value.
     */
    private void assertPropertyEquals(
            QName property, String expected, String mimetype, Map<QName, Serializable> properties)
    {
        assertEquals(
                "Property " + property + " not found for mimetype " + mimetype,
                expected, properties.get(property));
    }
}

View File

@@ -1,169 +1,169 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.swing.text.ChangedCharSetException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
/**
 * Metadata extracter for HTML and XHTML content, built on the Swing HTML parser.
 * <p>
 * The text of a <code>title</code> element opened inside <code>head</code> is stored as
 * {@link ContentModel#PROP_TITLE}. <code>meta</code> tags named <code>creator</code>,
 * <code>author</code> or <code>dc.creator</code> are stored as {@link ContentModel#PROP_AUTHOR},
 * and those named <code>description</code> or <code>dc.description</code> as
 * {@link ContentModel#PROP_DESCRIPTION}. The names are compared case-insensitively.
 *
 * @author Jesper Steen Møller
 */
public class HtmlMetadataExtracter extends AbstractMetadataExtracter
{
    /** Upper bound on the number of parse attempts for one document. */
    private static final int MAX_ATTEMPTS = 3;
    /** Key that introduces the charset inside a content-type specification. */
    private static final String CHARSET_KEY = "charset=";

    private static final Set<String> MIMETYPES = new HashSet<String>(5);
    static
    {
        MIMETYPES.add(MimetypeMap.MIMETYPE_HTML);
        MIMETYPES.add(MimetypeMap.MIMETYPE_XHTML);
    }

    public HtmlMetadataExtracter()
    {
        super(MIMETYPES, 1.0, 1000);
    }

    /**
     * Parses the content and copies the extracted properties into the destination map.
     * <p>
     * The first pass decodes the stream with the platform default charset. If the parser reports
     * that the document declares a charset, the content is reopened and parsed again, this time
     * decoded with the declared charset when the JVM supports it, and with the parser told to
     * ignore further charset declarations. Properties gathered by an aborted pass are discarded.
     *
     * @param reader the source of the content; a fresh reader is obtained from it for a retry
     * @param destination the property map to fill
     * @throws Throwable if reading or parsing the content fails
     */
    public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
    {
        final Map<QName, Serializable> tempDestination = new HashMap<QName, Serializable>();

        HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback()
        {
            // non-null only while the text of a <title> inside <head> is being collected
            StringBuilder title = null;
            boolean inHead = false;

            public void handleText(char[] data, int pos)
            {
                if (title != null)
                {
                    title.append(data);
                }
            }

            public void handleComment(char[] data, int pos)
            {
                // Perhaps sniff for Office 9+ metadata in here?
            }

            public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
            {
                if (HTML.Tag.HEAD.equals(t))
                {
                    inHead = true;
                }
                else if (HTML.Tag.TITLE.equals(t) && inHead)
                {
                    title = new StringBuilder();
                }
                else
                {
                    // any other start tag gets the same treatment as a simple tag
                    handleSimpleTag(t, a, pos);
                }
            }

            public void handleEndTag(HTML.Tag t, int pos)
            {
                if (HTML.Tag.HEAD.equals(t))
                {
                    inHead = false;
                }
                else if (HTML.Tag.TITLE.equals(t) && title != null)
                {
                    trimPut(ContentModel.PROP_TITLE, title.toString(), tempDestination);
                    title = null;
                }
            }

            public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
            {
                if (!HTML.Tag.META.equals(t))
                {
                    return;
                }
                Object nameO = a.getAttribute(HTML.Attribute.NAME);
                Object valueO = a.getAttribute(HTML.Attribute.CONTENT);
                if (nameO == null || valueO == null)
                {
                    return;
                }

                String name = nameO.toString();
                if (name.equalsIgnoreCase("creator") || name.equalsIgnoreCase("author")
                        || name.equalsIgnoreCase("dc.creator"))
                {
                    trimPut(ContentModel.PROP_AUTHOR, valueO, tempDestination);
                }
                if (name.equalsIgnoreCase("description") || name.equalsIgnoreCase("dc.description"))
                {
                    trimPut(ContentModel.PROP_DESCRIPTION, valueO, tempDestination);
                }
            }

            public void handleError(String errorMsg, int pos)
            {
                // parse errors are deliberately ignored: extraction is best-effort
            }
        };

        // null means "decode with the platform default", which is what the first pass does
        String charsetGuess = null;
        int tries = 0;
        while (tries < MAX_ATTEMPTS)
        {
            tempDestination.clear();
            Reader r = null;
            InputStream cis = null;
            try
            {
                cis = reader.getContentInputStream();
                r = openReader(cis, charsetGuess);
                HTMLEditorKit.Parser parser = new ParserDelegator();
                // after the first pass the parser must not raise charset changes again
                parser.parse(r, callback, tries > 0);
                destination.putAll(tempDestination);
                break;
            }
            catch (ChangedCharSetException ccse)
            {
                tries++;
                charsetGuess = parseCharsetName(ccse);
                // the stream has been partly consumed: start over on a fresh reader
                reader = reader.getReader();
            }
            finally
            {
                // close both, even if closing the character reader fails
                try
                {
                    if (r != null)
                    {
                        r.close();
                    }
                }
                finally
                {
                    if (cis != null)
                    {
                        cis.close();
                    }
                }
            }
        }
    }

    /**
     * Wraps the stream in a character reader for the named charset, falling back to the
     * platform default when no name is given or the JVM does not know the charset.
     */
    private static Reader openReader(InputStream in, String charsetName)
    {
        if (charsetName != null)
        {
            try
            {
                return new InputStreamReader(in, charsetName);
            }
            catch (java.io.UnsupportedEncodingException ignored)
            {
                // a document may declare anything; an unknown charset must not fail extraction
                if (logger.isDebugEnabled())
                {
                    logger.debug("Unsupported charset declared in HTML, using default: " + charsetName);
                }
            }
        }
        return new InputStreamReader(in);
    }

    /**
     * Derives a charset name from the specification carried by the parser's exception.
     *
     * @return the charset name, or <code>null</code> if the specification does not contain one
     */
    private static String parseCharsetName(ChangedCharSetException ccse)
    {
        String spec = ccse.getCharSetSpec();
        if (spec == null)
        {
            return null;
        }

        String name = spec;
        if (!ccse.keyEqualsCharSet())
        {
            // the spec is a content-type value such as "text/html; charset=ISO-8859-1"
            int begin = -1;
            int keyLength = CHARSET_KEY.length();
            for (int i = 0; i + keyLength <= spec.length(); i++)
            {
                if (spec.regionMatches(true, i, CHARSET_KEY, 0, keyLength))
                {
                    begin = i;
                    break;
                }
            }
            if (begin < 0)
            {
                return null;
            }
            name = spec.substring(begin + keyLength);
            // drop any further parameters that follow the charset
            int end = name.indexOf(';');
            if (end >= 0)
            {
                name = name.substring(0, end);
            }
        }

        name = name.trim();
        // strip a matching pair of surrounding quotes
        if (name.length() >= 2)
        {
            char first = name.charAt(0);
            if ((first == '"' || first == '\'') && name.charAt(name.length() - 1) == first)
            {
                name = name.substring(1, name.length() - 1).trim();
            }
        }
        return name.length() == 0 ? null : name;
    }
}
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import javax.swing.text.ChangedCharSetException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
/**
 * Metadata extracter for HTML and XHTML content, built on the Swing HTML parser.
 * <p>
 * The text of a <code>title</code> element opened inside <code>head</code> is stored as
 * {@link ContentModel#PROP_TITLE}. <code>meta</code> tags named <code>creator</code>,
 * <code>author</code> or <code>dc.creator</code> are stored as {@link ContentModel#PROP_AUTHOR},
 * and those named <code>description</code> or <code>dc.description</code> as
 * {@link ContentModel#PROP_DESCRIPTION}. The names are compared case-insensitively.
 *
 * @author Jesper Steen Møller
 */
public class HtmlMetadataExtracter extends AbstractMetadataExtracter
{
    /** Upper bound on the number of parse attempts for one document. */
    private static final int MAX_ATTEMPTS = 3;
    /** Key that introduces the charset inside a content-type specification. */
    private static final String CHARSET_KEY = "charset=";

    private static final Set<String> MIMETYPES = new HashSet<String>(5);
    static
    {
        MIMETYPES.add(MimetypeMap.MIMETYPE_HTML);
        MIMETYPES.add(MimetypeMap.MIMETYPE_XHTML);
    }

    public HtmlMetadataExtracter()
    {
        super(MIMETYPES, 1.0, 1000);
    }

    /**
     * Parses the content and copies the extracted properties into the destination map.
     * <p>
     * The first pass decodes the stream with the platform default charset. If the parser reports
     * that the document declares a charset, the content is reopened and parsed again, this time
     * decoded with the declared charset when the JVM supports it, and with the parser told to
     * ignore further charset declarations. Properties gathered by an aborted pass are discarded.
     *
     * @param reader the source of the content; a fresh reader is obtained from it for a retry
     * @param destination the property map to fill
     * @throws Throwable if reading or parsing the content fails
     */
    public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
    {
        final Map<QName, Serializable> tempDestination = new HashMap<QName, Serializable>();

        HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback()
        {
            // non-null only while the text of a <title> inside <head> is being collected
            StringBuilder title = null;
            boolean inHead = false;

            public void handleText(char[] data, int pos)
            {
                if (title != null)
                {
                    title.append(data);
                }
            }

            public void handleComment(char[] data, int pos)
            {
                // Perhaps sniff for Office 9+ metadata in here?
            }

            public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
            {
                if (HTML.Tag.HEAD.equals(t))
                {
                    inHead = true;
                }
                else if (HTML.Tag.TITLE.equals(t) && inHead)
                {
                    title = new StringBuilder();
                }
                else
                {
                    // any other start tag gets the same treatment as a simple tag
                    handleSimpleTag(t, a, pos);
                }
            }

            public void handleEndTag(HTML.Tag t, int pos)
            {
                if (HTML.Tag.HEAD.equals(t))
                {
                    inHead = false;
                }
                else if (HTML.Tag.TITLE.equals(t) && title != null)
                {
                    trimPut(ContentModel.PROP_TITLE, title.toString(), tempDestination);
                    title = null;
                }
            }

            public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
            {
                if (!HTML.Tag.META.equals(t))
                {
                    return;
                }
                Object nameO = a.getAttribute(HTML.Attribute.NAME);
                Object valueO = a.getAttribute(HTML.Attribute.CONTENT);
                if (nameO == null || valueO == null)
                {
                    return;
                }

                String name = nameO.toString();
                if (name.equalsIgnoreCase("creator") || name.equalsIgnoreCase("author")
                        || name.equalsIgnoreCase("dc.creator"))
                {
                    trimPut(ContentModel.PROP_AUTHOR, valueO, tempDestination);
                }
                if (name.equalsIgnoreCase("description") || name.equalsIgnoreCase("dc.description"))
                {
                    trimPut(ContentModel.PROP_DESCRIPTION, valueO, tempDestination);
                }
            }

            public void handleError(String errorMsg, int pos)
            {
                // parse errors are deliberately ignored: extraction is best-effort
            }
        };

        // null means "decode with the platform default", which is what the first pass does
        String charsetGuess = null;
        int tries = 0;
        while (tries < MAX_ATTEMPTS)
        {
            tempDestination.clear();
            Reader r = null;
            InputStream cis = null;
            try
            {
                cis = reader.getContentInputStream();
                r = openReader(cis, charsetGuess);
                HTMLEditorKit.Parser parser = new ParserDelegator();
                // after the first pass the parser must not raise charset changes again
                parser.parse(r, callback, tries > 0);
                destination.putAll(tempDestination);
                break;
            }
            catch (ChangedCharSetException ccse)
            {
                tries++;
                charsetGuess = parseCharsetName(ccse);
                // the stream has been partly consumed: start over on a fresh reader
                reader = reader.getReader();
            }
            finally
            {
                // close both, even if closing the character reader fails
                try
                {
                    if (r != null)
                    {
                        r.close();
                    }
                }
                finally
                {
                    if (cis != null)
                    {
                        cis.close();
                    }
                }
            }
        }
    }

    /**
     * Wraps the stream in a character reader for the named charset, falling back to the
     * platform default when no name is given or the JVM does not know the charset.
     */
    private static Reader openReader(InputStream in, String charsetName)
    {
        if (charsetName != null)
        {
            try
            {
                return new InputStreamReader(in, charsetName);
            }
            catch (java.io.UnsupportedEncodingException ignored)
            {
                // a document may declare anything; an unknown charset must not fail extraction
                if (logger.isDebugEnabled())
                {
                    logger.debug("Unsupported charset declared in HTML, using default: " + charsetName);
                }
            }
        }
        return new InputStreamReader(in);
    }

    /**
     * Derives a charset name from the specification carried by the parser's exception.
     *
     * @return the charset name, or <code>null</code> if the specification does not contain one
     */
    private static String parseCharsetName(ChangedCharSetException ccse)
    {
        String spec = ccse.getCharSetSpec();
        if (spec == null)
        {
            return null;
        }

        String name = spec;
        if (!ccse.keyEqualsCharSet())
        {
            // the spec is a content-type value such as "text/html; charset=ISO-8859-1"
            int begin = -1;
            int keyLength = CHARSET_KEY.length();
            for (int i = 0; i + keyLength <= spec.length(); i++)
            {
                if (spec.regionMatches(true, i, CHARSET_KEY, 0, keyLength))
                {
                    begin = i;
                    break;
                }
            }
            if (begin < 0)
            {
                return null;
            }
            name = spec.substring(begin + keyLength);
            // drop any further parameters that follow the charset
            int end = name.indexOf(';');
            if (end >= 0)
            {
                name = name.substring(0, end);
            }
        }

        name = name.trim();
        // strip a matching pair of surrounding quotes
        if (name.length() >= 2)
        {
            char first = name.charAt(0);
            if ((first == '"' || first == '\'') && name.charAt(name.length() - 1) == first)
            {
                name = name.substring(1, name.length() - 1).trim();
            }
        }
        return name.length() == 0 ? null : name;
    }
}

View File

@@ -1,57 +1,57 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import org.alfresco.repo.content.MimetypeMap;
/**
 * Tests the {@link HtmlMetadataExtracter}: its reported reliabilities and the extraction
 * of the common metadata from the HTML quick file.
 *
 * @author Jesper Steen Møller
 */
public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    private MetadataExtracter extracter;

    /**
     * Performs the base set up and creates a new extracter for each test.
     */
    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new HtmlMetadataExtracter();
    }

    /**
     * @return Returns the extracter created in {@link #setUp()}
     */
    @Override
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Plain text must be reported as unsupported and HTML as fully reliable.
     */
    public void testReliability() throws Exception
    {
        assertEquals(
                "Mimetype text should not be supported",
                0.0, extracter.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN));
        assertEquals(
                "HTML should be supported",
                1.0, extracter.getReliability(MimetypeMap.MIMETYPE_HTML));
    }

    /**
     * Extracts the HTML quick file and checks the common metadata.
     */
    public void testHtmlExtraction() throws Exception
    {
        testExtractFromMimetype(MimetypeMap.MIMETYPE_HTML);
    }
}
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import org.alfresco.repo.content.MimetypeMap;
/**
 * Tests the {@link HtmlMetadataExtracter}: its reported reliabilities and the extraction
 * of the common metadata from the HTML quick file.
 *
 * @author Jesper Steen Møller
 */
public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    private MetadataExtracter extracter;

    /**
     * Performs the base set up and creates a new extracter for each test.
     */
    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new HtmlMetadataExtracter();
    }

    /**
     * @return Returns the extracter created in {@link #setUp()}
     */
    @Override
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Plain text must be reported as unsupported and HTML as fully reliable.
     */
    public void testReliability() throws Exception
    {
        assertEquals(
                "Mimetype text should not be supported",
                0.0, extracter.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN));
        assertEquals(
                "HTML should be supported",
                1.0, extracter.getReliability(MimetypeMap.MIMETYPE_HTML));
    }

    /**
     * Extracts the HTML quick file and checks the common metadata.
     */
    public void testHtmlExtraction() throws Exception
    {
        testExtractFromMimetype(MimetypeMap.MIMETYPE_HTML);
    }
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
* Copyright (C) 2005 Alfresco, Inc.
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
@@ -26,6 +26,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.NamespaceService;
@@ -45,17 +46,8 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
public static String[] SUPPORTED_MIMETYPES = new String[] {
"message/rfc822"};
private static final String SUBSTG_MESSAGEBODY = "__substg1.0_1000001E";
private static final String SUBSTG_RECIPIENTEMAIL = "__substg1.0_39FE001E";
private static final String SUBSTG_RECEIVEDEMAIL = "__substg1.0_0076001E";
private static final String SUBSTG_SENDEREMAIL = "__substg1.0_0C1F001E";
private static final String SUBSTG_DATE = "__substg1.0_00470102";
private static final QName ASPECT_MAILED = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "emailed");
private static final QName PROP_SENTDATE = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "sentdate");
private static final QName PROP_ORIGINATOR = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "originator");
private static final QName PROP_ADDRESSEE = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "addressee");
private static final QName PROP_ADDRESSEES = QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, "addressees");
private static final String STREAM_PREFIX = "__substg1.0_";
private static final int STREAM_PREFIX_LENGTH = STREAM_PREFIX.length();
// the CC: email addresses
private ThreadLocal<List<String>> receipientEmails = new ThreadLocal<List<String>>();
@@ -73,47 +65,10 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
{
try
{
String name = event.getName();
if (name.equals(SUBSTG_RECIPIENTEMAIL)) // a recipient email address
if (event.getName().startsWith(STREAM_PREFIX))
{
String emailAddress = readPlainTextStream(event.getStream());
receipientEmails.get().add(convertExchangeAddress(emailAddress));
}
else if (name.equals(SUBSTG_RECEIVEDEMAIL)) // receiver email address
{
String emailAddress = readPlainTextStream(event.getStream());
destination.put(PROP_ADDRESSEE, convertExchangeAddress(emailAddress));
}
else if (name.equals(SUBSTG_SENDEREMAIL)) // sender email - NOTE either email OR full Exchange data e.g. : /O=HOSTEDSERVICE2/OU=FIRST ADMINISTRATIVE GROUP/CN=RECIPIENTS/CN=MIKE.FARMAN@BEN
{
String emailAddress = readPlainTextStream(event.getStream());
destination.put(PROP_ORIGINATOR, convertExchangeAddress(emailAddress));
}
else if (name.equals(SUBSTG_DATE))
{
// the date is not really plain text - but it's easier to parse as such
String date = readPlainTextStream(event.getStream());
int valueIndex = date.indexOf("l=");
if (valueIndex != -1)
{
int dateIndex = date.indexOf('-', valueIndex);
if (dateIndex != -1)
{
dateIndex++;
String strYear = date.substring(dateIndex, dateIndex + 2);
int year = Integer.parseInt(strYear) + (2000 - 1900);
String strMonth = date.substring(dateIndex + 2, dateIndex + 4);
int month = Integer.parseInt(strMonth) - 1;
String strDay = date.substring(dateIndex + 4, dateIndex + 6);
int day = Integer.parseInt(strDay);
String strHour = date.substring(dateIndex + 6, dateIndex + 8);
int hour = Integer.parseInt(strHour);
String strMinute = date.substring(dateIndex + 10, dateIndex + 12);
int minute = Integer.parseInt(strMinute);
destination.put(PROP_SENTDATE, new Date(year, month, day, hour, minute));
}
}
StreamHandler handler = new StreamHandler(event.getName(), event.getStream());
handler.process(destination);
}
}
catch (Exception ex)
@@ -145,7 +100,7 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
// store multi-value extracted property
if (receipientEmails.get().size() != 0)
{
destination.put(PROP_ADDRESSEES, (Serializable)receipientEmails.get());
destination.put(ContentModel.PROP_ADDRESSEES, (Serializable)receipientEmails.get());
}
}
finally
@@ -157,14 +112,6 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
}
}
/**
 * Reads the bytes currently reported as available on the stream and returns
 * them as a string decoded with the platform default charset.
 *
 * @param stream the document stream to read
 * @return the stream content as text
 * @throws IOException if the stream cannot be read
 */
private static String readPlainTextStream(DocumentInputStream stream)
    throws IOException
{
    byte[] buffer = new byte[stream.available()];
    stream.read(buffer);
    return new String(buffer);
}
private static String convertExchangeAddress(String email)
{
if (email.lastIndexOf("/CN=") == -1)
@@ -177,4 +124,111 @@ public class MailMetadataExtracter extends AbstractMetadataExtracter
return email.substring(email.lastIndexOf("/CN=") + 4);
}
}
// Encoding codes: the four characters that StreamHandler reads from the stream name
// directly after the four-character type code. extractText() compares against these.
private static final String ENCODING_TEXT = "001E";
private static final String ENCODING_BINARY = "0102";
private static final String ENCODING_UNICODE = "001F";
// Type codes: the four characters that StreamHandler reads from the stream name
// directly after STREAM_PREFIX. StreamHandler.process() dispatches on these.
private static final String SUBSTG_MESSAGEBODY = "1000";
private static final String SUBSTG_RECIPIENTEMAIL = "39FE";
private static final String SUBSTG_RECEIVEDEMAIL = "0076";
private static final String SUBSTG_SENDEREMAIL = "0C1F";
private static final String SUBSTG_DATE = "0047";
private static final String SUBSTG_SUBJECT = "0037";
/**
 * Handles a single property stream. The stream name (after the common prefix)
 * is split into a four-character type code and a four-character encoding code;
 * the type decides which property is populated and the encoding decides how the
 * stream bytes are turned into text.
 */
private class StreamHandler
{
    private final String type;
    private final String encoding;
    private final DocumentInputStream stream;

    /**
     * @param name the full stream name, starting with the stream prefix
     * @param stream the stream holding the property value
     */
    StreamHandler(String name, DocumentInputStream stream)
    {
        this.type = name.substring(STREAM_PREFIX_LENGTH, STREAM_PREFIX_LENGTH + 4);
        this.encoding = name.substring(STREAM_PREFIX_LENGTH + 4, STREAM_PREFIX_LENGTH + 8);
        this.stream = stream;
    }

    /**
     * Extracts the value of the stream, if its type is one of the handled ones,
     * and stores it in the destination map (or in the recipient list for
     * recipient addresses). Streams of any other type are ignored.
     *
     * @param destination the map receiving the extracted properties
     * @throws IOException if the stream cannot be read
     */
    void process(final Map<QName, Serializable> destination)
        throws IOException
    {
        if (type.equals(SUBSTG_SENDEREMAIL))
        {
            destination.put(ContentModel.PROP_ORIGINATOR, convertExchangeAddress(extractText()));
        }
        else if (type.equals(SUBSTG_RECIPIENTEMAIL))
        {
            receipientEmails.get().add(convertExchangeAddress(extractText()));
        }
        else if (type.equals(SUBSTG_RECEIVEDEMAIL))
        {
            destination.put(ContentModel.PROP_ADDRESSEE, convertExchangeAddress(extractText()));
        }
        else if (type.equals(SUBSTG_SUBJECT))
        {
            destination.put(ContentModel.PROP_SUBJECT, extractText());
        }
        else if (type.equals(SUBSTG_DATE))
        {
            // the date is not really plain text - but it's easier to parse as such
            Date sentDate = parseSentDate(extractText());
            if (sentDate != null)
            {
                destination.put(ContentModel.PROP_SENTDATE, sentDate);
            }
        }
    }

    /**
     * Parses the sent date out of the date stream text. The digits are expected
     * directly after the first '-' that follows the "l=" marker, as two-digit
     * year, month, day and hour fields followed by the minute field.
     *
     * @param date the text of the date stream
     * @return the parsed date, or <code>null</code> if the markers are not present
     */
    private Date parseSentDate(String date)
    {
        int valueIndex = date.indexOf("l=");
        if (valueIndex == -1)
        {
            return null;
        }
        int dateIndex = date.indexOf('-', valueIndex);
        if (dateIndex == -1)
        {
            return null;
        }
        dateIndex++;
        // java.util.Date counts years from 1900; the stream holds a two-digit year in the 2000s
        int year = Integer.parseInt(date.substring(dateIndex, dateIndex + 2)) + (2000 - 1900);
        // java.util.Date months are zero-based
        int month = Integer.parseInt(date.substring(dateIndex + 2, dateIndex + 4)) - 1;
        int day = Integer.parseInt(date.substring(dateIndex + 4, dateIndex + 6));
        int hour = Integer.parseInt(date.substring(dateIndex + 6, dateIndex + 8));
        // NOTE(review): minutes are read from offset +10, skipping +8..+9 - confirm against the real layout
        int minute = Integer.parseInt(date.substring(dateIndex + 10, dateIndex + 12));
        return new Date(year, month, day, hour, minute);
    }

    /**
     * Extract the text from the stream based on the encoding.
     * <p>
     * Streams flagged as unicode hold two bytes per character and are decoded
     * as UTF-16LE; every other encoding is decoded with the platform default
     * charset.
     *
     * @return the stream content as text
     * @throws IOException if the stream cannot be read
     */
    private String extractText()
        throws IOException
    {
        byte[] data = new byte[stream.available()];
        // a single read() is not guaranteed to fill the buffer, so keep reading
        int length = 0;
        while (length < data.length)
        {
            int count = stream.read(data, length, data.length - length);
            if (count <= 0)
            {
                break;
            }
            length += count;
        }
        if (this.encoding.equals(ENCODING_UNICODE))
        {
            // ignore a dangling odd byte so that only whole two-byte characters are decoded
            return new String(data, 0, length & ~1, "UTF-16LE");
        }
        return new String(data, 0, length);
    }
}
}

View File

@@ -1,72 +1,72 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
/**
* Contract for a component that extracts metadata properties from content of
* one or more mimetypes.
*
* @author Jesper Steen Møller
*/
public interface MetadataExtracter
{
/**
* Provides the approximate accuracy with which this extracter can extract
* metadata for the mimetype.
*
* @param sourceMimetype the source mimetype
* @return Returns a score 0.0 to 1.0. 0.0 indicates that the extraction
* cannot be performed at all. 1.0 indicates that the extraction can
* be performed perfectly.
*/
public double getReliability(String sourceMimetype);
/**
* Provides an estimate, usually a worst case guess, of how long an
* extraction will take.
* <p>
* This method is used to determine, up front, which of a set of equally
* reliable extracters will be used for a specific extraction.
*
* @return Returns the approximate number of milliseconds per extraction
*/
public long getExtractionTime();
/**
* Extracts the metadata from the content provided by the reader and source
* mimetype to the supplied map.
* <p>
* The extraction viability can be determined by an up front call to
* {@link #getReliability(String)}.
* <p>
* The source mimetype <b>must</b> be available on the
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
* of the reader.
*
* @param reader the source of the content
* @param destination the destination of the extraction
* @throws ContentIOException if an IO exception occurs
*/
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException;
}
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.Serializable;
import java.util.Map;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
/**
* Contract for a component that extracts metadata properties from content of
* one or more mimetypes.
*
* @author Jesper Steen Møller
*/
public interface MetadataExtracter
{
/**
* Provides the approximate accuracy with which this extracter can extract
* metadata for the mimetype.
*
* @param sourceMimetype the source mimetype
* @return Returns a score 0.0 to 1.0. 0.0 indicates that the extraction
* cannot be performed at all. 1.0 indicates that the extraction can
* be performed perfectly.
*/
public double getReliability(String sourceMimetype);
/**
* Provides an estimate, usually a worst case guess, of how long an
* extraction will take.
* <p>
* This method is used to determine, up front, which of a set of equally
* reliable extracters will be used for a specific extraction.
*
* @return Returns the approximate number of milliseconds per extraction
*/
public long getExtractionTime();
/**
* Extracts the metadata from the content provided by the reader and source
* mimetype to the supplied map.
* <p>
* The extraction viability can be determined by an up front call to
* {@link #getReliability(String)}.
* <p>
* The source mimetype <b>must</b> be available on the
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
* of the reader.
*
* @param reader the source of the content
* @param destination the destination of the extraction
* @throws ContentIOException if an IO exception occurs
*/
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException;
}

View File

@@ -1,191 +1,172 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.alfresco.error.AlfrescoRuntimeException;
import org.alfresco.repo.content.MimetypeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Holds and provides the most appropriate metadata extracter for a particular
* mimetype.
* <p>
* The extracters themselves know how well they are able to extract metadata.
*
* @see org.alfresco.repo.content.metadata.MetadataExtracter
* @author Jesper Steen Møller
*/
public class MetadataExtracterRegistry
{
private static final Log logger = LogFactory.getLog(MetadataExtracterRegistry.class);
private List<MetadataExtracter> extracters;
private Map<String, MetadataExtracter> extracterCache;
private MimetypeMap mimetypeMap;
/** Controls read access to the cache */
private Lock extracterCacheReadLock;
/** controls write access to the cache */
private Lock extracterCacheWriteLock;
/**
 * Creates an empty registry: no extracters registered, nothing cached.
 */
public MetadataExtracterRegistry()
{
    // the read and write locks are the two halves of one read/write lock guarding the cache
    ReadWriteLock cacheLock = new ReentrantReadWriteLock();
    extracterCacheReadLock = cacheLock.readLock();
    extracterCacheWriteLock = cacheLock.writeLock();

    // registered extracters and the per-mimetype lookup cache
    extracters = new ArrayList<MetadataExtracter>(10);
    extracterCache = new HashMap<String, MetadataExtracter>(17);
}
/**
* Sets the mimetype map that {@link #getExtracter(String)} uses to check that a
* requested source mimetype is known.
*
* @param mimetypeMap a map of mimetypes
*/
public void setMimetypeMap(MimetypeMap mimetypeMap)
{
this.mimetypeMap = mimetypeMap;
}
/**
* Register an instance of an extracter for use.
* <p>
* The extracter is appended to the list of candidates and the lookup cache is
* cleared, so that later lookups take the new extracter into account.
*
* @param extracter an extracter
*/
public void register(MetadataExtracter extracter)
{
if (logger.isDebugEnabled())
{
logger.debug("Registering metadata extracter: " + extracter);
}
// the list and the cache are changed together under the write lock
extracterCacheWriteLock.lock();
try
{
extracters.add(extracter);
extracterCache.clear();
}
finally
{
extracterCacheWriteLock.unlock();
}
}
/**
* Gets the best metadata extracter. This is a combination of the most
* reliable and the most performant extracter.
* <p>
* The result is cached for quicker access next time.
*
* @param sourceMimetype the source MIME type of the extraction
* @return Returns a metadata extracter that can extract metadata from the
* chosen MIME type, or <code>null</code> if no registered extracter
* supports it.
* @throws AlfrescoRuntimeException if the mimetype is not in the mimetype map
*/
public MetadataExtracter getExtracter(String sourceMimetype)
{
// check that the mimetype is valid
if (!mimetypeMap.getMimetypes().contains(sourceMimetype))
{
throw new AlfrescoRuntimeException("Unknown extraction source mimetype: " + sourceMimetype);
}
MetadataExtracter extracter = null;
extracterCacheReadLock.lock();
try
{
if (extracterCache.containsKey(sourceMimetype))
{
// the extracter has been requested before
// it might have been null
return extracterCache.get(sourceMimetype);
}
}
finally
{
extracterCacheReadLock.unlock();
}
// the extracter has not been requested before
// get a write lock on the cache
// no double check done as it is not an expensive task
extracterCacheWriteLock.lock();
try
{
// find the most suitable extracter - may be null
extracter = findBestExtracter(sourceMimetype);
// store the result even if it is null
extracterCache.put(sourceMimetype, extracter);
return extracter;
}
finally
{
extracterCacheWriteLock.unlock();
}
}
/**
 * Picks the extracter to use for a mimetype: the highest reliability wins and,
 * among extracters of equal reliability, the one with the lowest extraction time.
 *
 * @param sourceMimetype The MIME type under examination
 * @return The fastest of the most reliable extracters in <code>extracters</code>
 *         for the given MIME type, or null if none is available.
 */
private MetadataExtracter findBestExtracter(String sourceMimetype)
{
    logger.debug("Finding best extracter for " + sourceMimetype);

    MetadataExtracter winner = null;
    double winningReliability = -1;
    long winningTime = Long.MAX_VALUE;
    for (MetadataExtracter candidate : extracters)
    {
        double reliability = candidate.getReliability(sourceMimetype);
        if (reliability <= 0.0)
        {
            // this candidate cannot handle the mimetype at all
            continue;
        }
        if (reliability > winningReliability)
        {
            // strictly more reliable: takes over regardless of speed
            winner = candidate;
            winningReliability = reliability;
            winningTime = candidate.getExtractionTime();
        }
        else if (reliability == winningReliability)
        {
            // equally reliable: only takes over when it is faster
            long candidateTime = candidate.getExtractionTime();
            if (candidateTime < winningTime)
            {
                winner = candidate;
                winningTime = candidateTime;
            }
        }
    }
    return winner;
}
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Holds and provides the most appropriate metadata extracter for a particular
 * mimetype.
 * <p>
 * The extracters themselves know how well they are able to extract metadata.
 * Lookup results are cached per mimetype; the cache is guarded by a read/write
 * lock and is cleared whenever another extracter is registered.
 *
 * @see org.alfresco.repo.content.metadata.MetadataExtracter
 * @author Jesper Steen Møller
 */
public class MetadataExtracterRegistry
{
    private static final Log logger = LogFactory.getLog(MetadataExtracterRegistry.class);

    /** All registered extracters, in registration order */
    private final List<MetadataExtracter> extracters;
    /** Lookup results by source mimetype; a null value records "no extracter available" */
    private final Map<String, MetadataExtracter> extracterCache;
    /** Controls read access to the cache */
    private final Lock extracterCacheReadLock;
    /** controls write access to the cache */
    private final Lock extracterCacheWriteLock;

    /**
     * Creates an empty registry: no extracters registered, nothing cached.
     */
    public MetadataExtracterRegistry()
    {
        // initialise lists
        extracters = new ArrayList<MetadataExtracter>(10);
        extracterCache = new HashMap<String, MetadataExtracter>(17);

        // create lock objects for access to the cache
        ReadWriteLock extractionCacheLock = new ReentrantReadWriteLock();
        extracterCacheReadLock = extractionCacheLock.readLock();
        extracterCacheWriteLock = extractionCacheLock.writeLock();
    }

    /**
     * Register an instance of an extracter for use.
     * <p>
     * The lookup cache is cleared so that later lookups take the new extracter
     * into account.
     *
     * @param extracter an extracter
     */
    public void register(MetadataExtracter extracter)
    {
        if (logger.isDebugEnabled())
        {
            logger.debug("Registering metadata extracter: " + extracter);
        }

        extracterCacheWriteLock.lock();
        try
        {
            extracters.add(extracter);
            extracterCache.clear();
        }
        finally
        {
            extracterCacheWriteLock.unlock();
        }
    }

    /**
     * Gets the best metadata extracter. This is a combination of the most
     * reliable and the most performant extracter.
     * <p>
     * The result is cached for quicker access next time.
     *
     * @param sourceMimetype the source MIME type of the extraction
     * @return Returns a metadata extracter that can extract metadata from the
     *         chosen MIME type, or <code>null</code> if no registered extracter
     *         supports it.
     */
    public MetadataExtracter getExtracter(String sourceMimetype)
    {
        extracterCacheReadLock.lock();
        try
        {
            if (extracterCache.containsKey(sourceMimetype))
            {
                // the extracter has been requested before
                // it might have been null
                return extracterCache.get(sourceMimetype);
            }
        }
        finally
        {
            extracterCacheReadLock.unlock();
        }

        // the extracter has not been requested before
        // get a write lock on the cache
        // no double check done as it is not an expensive task
        extracterCacheWriteLock.lock();
        try
        {
            // find the most suitable extracter - may be null
            MetadataExtracter extracter = findBestExtracter(sourceMimetype);
            // store the result even if it is null
            extracterCache.put(sourceMimetype, extracter);
            return extracter;
        }
        finally
        {
            extracterCacheWriteLock.unlock();
        }
    }

    /**
     * @param sourceMimetype The MIME type under examination
     * @return The fastest of the most reliable extracters in <code>extracters</code>
     *         for the given MIME type, or null if none is available.
     */
    private MetadataExtracter findBestExtracter(String sourceMimetype)
    {
        // guarded like the logging in register(), so the message is only built when needed
        if (logger.isDebugEnabled())
        {
            logger.debug("Finding best extracter for " + sourceMimetype);
        }

        double bestReliability = -1;
        long bestTime = Long.MAX_VALUE;
        MetadataExtracter bestExtracter = null;

        for (MetadataExtracter ext : extracters)
        {
            double r = ext.getReliability(sourceMimetype);
            if (r <= 0.0)
            {
                // extraction not achievable
                continue;
            }
            else if (r == bestReliability)
            {
                // equally reliable: prefer the faster extracter
                long time = ext.getExtractionTime();
                if (time < bestTime)
                {
                    bestExtracter = ext;
                    bestTime = time;
                }
            }
            else if (r > bestReliability)
            {
                // more reliable: takes over regardless of speed
                bestExtracter = ext;
                bestReliability = r;
                bestTime = ext.getExtractionTime();
            }
        }
        return bestExtracter;
    }
}

View File

@@ -1,101 +1,101 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
/**
 * Office file format Metadata Extracter.
 * <p>
 * Reads the summary-information property set of the document and copies
 * title, subject, creation date, last-save date and author to the destination map.
 *
 * @author Jesper Steen Møller
 */
public class OfficeMetadataExtracter extends AbstractMetadataExtracter
{
    public static String[] SUPPORTED_MIMETYPES = new String[] {
        MimetypeMap.MIMETYPE_WORD,
        MimetypeMap.MIMETYPE_EXCEL,
        MimetypeMap.MIMETYPE_PPT};

    public OfficeMetadataExtracter()
    {
        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)), 1.0, 1000);
    }

    /**
     * Streams the content through a POIFS reader that has a listener registered
     * for the summary-information stream.
     *
     * @param reader the source of the content
     * @param destination the map receiving the extracted properties
     */
    public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
    {
        POIFSReaderListener summaryListener = createSummaryListener(destination);

        InputStream contentStream = null;
        try
        {
            contentStream = reader.getContentInputStream();
            POIFSReader poiReader = new POIFSReader();
            poiReader.registerListener(summaryListener, SummaryInformation.DEFAULT_STREAM_NAME);
            poiReader.read(contentStream);
        }
        finally
        {
            if (contentStream != null)
            {
                try
                {
                    contentStream.close();
                }
                catch (IOException ignored)
                {
                    // best effort: a failure to close is deliberately not reported
                }
            }
        }
    }

    /**
     * Builds the listener that maps a summary-information property set onto the
     * destination map. Any failure is rethrown as a {@link ContentIOException}.
     */
    private POIFSReaderListener createSummaryListener(final Map<QName, Serializable> destination)
    {
        return new POIFSReaderListener()
        {
            public void processPOIFSReaderEvent(final POIFSReaderEvent event)
            {
                try
                {
                    PropertySet properties = PropertySetFactory.create(event.getStream());
                    if (!(properties instanceof SummaryInformation))
                    {
                        // not the summary information: nothing to extract
                        return;
                    }
                    SummaryInformation summary = (SummaryInformation) properties;

                    // Titled aspect
                    trimPut(ContentModel.PROP_TITLE, summary.getTitle(), destination);
                    trimPut(ContentModel.PROP_DESCRIPTION, summary.getSubject(), destination);

                    // Auditable aspect
                    trimPut(ContentModel.PROP_CREATED, summary.getCreateDateTime(), destination);
                    trimPut(ContentModel.PROP_MODIFIED, summary.getLastSaveDateTime(), destination);
                    trimPut(ContentModel.PROP_AUTHOR, summary.getAuthor(), destination);
                }
                catch (Exception ex)
                {
                    throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
                }
            }
        };
    }
}
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentIOException;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
/**
 * Office file format Metadata Extracter.
 * <p>
 * Reads the summary-information property set of the document and copies
 * title, subject, creation date, last-save date and author to the destination map.
 *
 * @author Jesper Steen Møller
 */
public class OfficeMetadataExtracter extends AbstractMetadataExtracter
{
    public static String[] SUPPORTED_MIMETYPES = new String[] {
        MimetypeMap.MIMETYPE_WORD,
        MimetypeMap.MIMETYPE_EXCEL,
        MimetypeMap.MIMETYPE_PPT};

    public OfficeMetadataExtracter()
    {
        super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)), 1.0, 1000);
    }

    /**
     * Streams the content through a POIFS reader that has a listener registered
     * for the summary-information stream.
     *
     * @param reader the source of the content
     * @param destination the map receiving the extracted properties
     */
    public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
    {
        POIFSReaderListener summaryListener = createSummaryListener(destination);

        InputStream contentStream = null;
        try
        {
            contentStream = reader.getContentInputStream();
            POIFSReader poiReader = new POIFSReader();
            poiReader.registerListener(summaryListener, SummaryInformation.DEFAULT_STREAM_NAME);
            poiReader.read(contentStream);
        }
        finally
        {
            if (contentStream != null)
            {
                try
                {
                    contentStream.close();
                }
                catch (IOException ignored)
                {
                    // best effort: a failure to close is deliberately not reported
                }
            }
        }
    }

    /**
     * Builds the listener that maps a summary-information property set onto the
     * destination map. Any failure is rethrown as a {@link ContentIOException}.
     */
    private POIFSReaderListener createSummaryListener(final Map<QName, Serializable> destination)
    {
        return new POIFSReaderListener()
        {
            public void processPOIFSReaderEvent(final POIFSReaderEvent event)
            {
                try
                {
                    PropertySet properties = PropertySetFactory.create(event.getStream());
                    if (!(properties instanceof SummaryInformation))
                    {
                        // not the summary information: nothing to extract
                        return;
                    }
                    SummaryInformation summary = (SummaryInformation) properties;

                    // Titled aspect
                    trimPut(ContentModel.PROP_TITLE, summary.getTitle(), destination);
                    trimPut(ContentModel.PROP_DESCRIPTION, summary.getSubject(), destination);

                    // Auditable aspect
                    trimPut(ContentModel.PROP_CREATED, summary.getCreateDateTime(), destination);
                    trimPut(ContentModel.PROP_MODIFIED, summary.getLastSaveDateTime(), destination);
                    trimPut(ContentModel.PROP_AUTHOR, summary.getAuthor(), destination);
                }
                catch (Exception ex)
                {
                    throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
                }
            }
        };
    }
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2005 Jesper Steen M<EFBFBD>ller
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2005 Jesper Steen M<EFBFBD>ller
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a

View File

@@ -1,75 +1,75 @@
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Calendar;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
/**
 * Metadata extracter for PDF documents, based on PDFBox.
 * <p>
 * Copies author, title, subject and creation date from the document
 * information to the destination map.
 *
 * @author Jesper Steen Møller
 */
public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter
{
    public PdfBoxMetadataExtracter()
    {
        super(MimetypeMap.MIMETYPE_PDF, 1.0, 1000);
    }

    /**
     * Loads the PDF from the reader's input stream and extracts its document
     * information. Both the stream and the document are closed afterwards.
     *
     * @param reader the source of the content
     * @param destination the map receiving the extracted properties
     */
    public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
    {
        PDDocument pdf = null;
        InputStream is = null;
        try
        {
            is = reader.getContentInputStream();
            // stream the document in
            pdf = PDDocument.load(is);
            // Scoop out the metadata
            PDDocumentInformation docInfo = pdf.getDocumentInformation();

            trimPut(ContentModel.PROP_AUTHOR, docInfo.getAuthor(), destination);
            trimPut(ContentModel.PROP_TITLE, docInfo.getTitle(), destination);
            trimPut(ContentModel.PROP_DESCRIPTION, docInfo.getSubject(), destination);

            // the creation date is optional
            Calendar created = docInfo.getCreationDate();
            if (created != null)
            {
                destination.put(ContentModel.PROP_CREATED, created.getTime());
            }
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (IOException ignored)
                {
                    // best effort: a failure to close the stream is deliberately not reported
                }
            }
            if (pdf != null)
            {
                try
                {
                    pdf.close();
                }
                catch (Throwable e)
                {
                    // a failed close must not mask the extraction outcome; report it via the
                    // inherited logger rather than printing to stderr
                    logger.warn("Failed to close PDF document", e);
                }
            }
        }
    }
}
/*
* Copyright (C) 2005 Jesper Steen Møller
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Calendar;
import java.util.Map;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
/**
 * Metadata extracter for PDF documents, based on PDFBox.
 * <p>
 * Copies author, title, subject and creation date from the document
 * information to the destination map.
 *
 * @author Jesper Steen Møller
 */
public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter
{
    public PdfBoxMetadataExtracter()
    {
        super(MimetypeMap.MIMETYPE_PDF, 1.0, 1000);
    }

    /**
     * Loads the PDF from the reader's input stream and extracts its document
     * information. Both the stream and the document are closed afterwards.
     *
     * @param reader the source of the content
     * @param destination the map receiving the extracted properties
     */
    public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
    {
        PDDocument pdf = null;
        InputStream is = null;
        try
        {
            is = reader.getContentInputStream();
            // stream the document in
            pdf = PDDocument.load(is);
            // Scoop out the metadata
            PDDocumentInformation docInfo = pdf.getDocumentInformation();

            trimPut(ContentModel.PROP_AUTHOR, docInfo.getAuthor(), destination);
            trimPut(ContentModel.PROP_TITLE, docInfo.getTitle(), destination);
            trimPut(ContentModel.PROP_DESCRIPTION, docInfo.getSubject(), destination);

            // the creation date is optional
            Calendar created = docInfo.getCreationDate();
            if (created != null)
            {
                destination.put(ContentModel.PROP_CREATED, created.getTime());
            }
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (IOException ignored)
                {
                    // best effort: a failure to close the stream is deliberately not reported
                }
            }
            if (pdf != null)
            {
                try
                {
                    pdf.close();
                }
                catch (Throwable e)
                {
                    // a failed close must not mask the extraction outcome; report it via the
                    // inherited logger rather than printing to stderr
                    logger.warn("Failed to close PDF document", e);
                }
            }
        }
    }
}

View File

@@ -1,43 +1,43 @@
package org.alfresco.repo.content.metadata;
import org.alfresco.repo.content.MimetypeMap;
/**
 * Tests for the PDFBox based extracter.
 *
 * @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter
 *
 * @author Jesper Steen Møller
 */
public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    private MetadataExtracter extracter;

    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new PdfBoxMetadataExtracter();
    }

    /**
     * @return Returns the extracter created in {@link #setUp()}
     */
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Plain text must be reported as unsupported and PDF as fully supported.
     */
    public void testReliability() throws Exception
    {
        double plainTextReliability = extracter.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN);
        assertEquals("Mimetype should not be supported", 0.0, plainTextReliability);

        double pdfReliability = extracter.getReliability(MimetypeMap.MIMETYPE_PDF);
        assertEquals("Mimetype should be supported", 1.0, pdfReliability);
    }

    /**
     * Runs the shared extraction check against the PDF mimetype.
     */
    public void testPdfExtraction() throws Exception
    {
        testExtractFromMimetype(MimetypeMap.MIMETYPE_PDF);
    }
}
package org.alfresco.repo.content.metadata;
import org.alfresco.repo.content.MimetypeMap;
/**
 * Tests for the PDFBox based extracter.
 *
 * @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter
 *
 * @author Jesper Steen Møller
 */
public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
{
    private MetadataExtracter extracter;

    @Override
    public void setUp() throws Exception
    {
        super.setUp();
        extracter = new PdfBoxMetadataExtracter();
    }

    /**
     * @return Returns the extracter created in {@link #setUp()}
     */
    protected MetadataExtracter getExtracter()
    {
        return extracter;
    }

    /**
     * Plain text must be reported as unsupported and PDF as fully supported.
     */
    public void testReliability() throws Exception
    {
        double plainTextReliability = extracter.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN);
        assertEquals("Mimetype should not be supported", 0.0, plainTextReliability);

        double pdfReliability = extracter.getReliability(MimetypeMap.MIMETYPE_PDF);
        assertEquals("Mimetype should be supported", 1.0, pdfReliability);
    }

    /**
     * Runs the shared extraction check against the PDF mimetype.
     */
    public void testPdfExtraction() throws Exception
    {
        testExtractFromMimetype(MimetypeMap.MIMETYPE_PDF);
    }
}