Inverted configuration of Metadata Extracters

- Adding an extracter no longer requires modification of the MetadataExtracterRegistry: extracters now register themselves with the registry (see the illustrative sketch below the changed-file summary)
- Fixed lack of stream closures in the extracters

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@2465 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Derek Hulley
2006-02-22 11:11:53 +00:00
parent 47ecffb07e
commit 31d9ef768b
13 changed files with 445 additions and 445 deletions
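
The following sketch (not part of this commit) illustrates the inverted configuration: a new extracter only extends AbstractMetadataExtracter and is declared in Spring with parent="baseMetadataExtracter", whose init-method="register" makes the bean register itself with the MetadataExtracterRegistry. The class name, package and mimetype below are hypothetical. The base class's final extract() method logs an error if the reader is left open, so extractInternal() closes its stream in a finally block:

    package org.example.content.metadata;  // hypothetical package, for illustration only

    import java.io.InputStream;
    import java.io.Serializable;
    import java.util.Map;

    import org.alfresco.model.ContentModel;
    import org.alfresco.repo.content.metadata.AbstractMetadataExtracter;
    import org.alfresco.service.cmr.repository.ContentReader;
    import org.alfresco.service.namespace.QName;

    /**
     * Hypothetical extracter used only to illustrate the self-registering pattern.
     */
    public class MyCustomMetadataExtracter extends AbstractMetadataExtracter
    {
        public MyCustomMetadataExtracter()
        {
            // supported mimetype, reliability and typical extraction time
            super("application/x-mycustom", 1.0, 1000);
        }

        protected void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
        {
            InputStream is = null;
            try
            {
                is = reader.getContentInputStream();
                // ... parse the stream and fill the destination map ...
                destination.put(ContentModel.PROP_TITLE, "Example title");
            }
            finally
            {
                // the base class checks reader.isClosed() after extraction, so close the stream here
                if (is != null)
                {
                    try { is.close(); } catch (Throwable e) {}
                }
            }
        }
    }

The only wiring such a class would need is one bean definition next to the other extracters, for example <bean class="org.example.content.metadata.MyCustomMetadataExtracter" parent="baseMetadataExtracter" />; the registry itself stays untouched.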

View File

@@ -87,27 +87,35 @@
         </constructor-arg>
     </bean>
 
     <!-- Metadata Extraction Regisitry -->
     <bean id="metadataExtracterRegistry" class="org.alfresco.repo.content.metadata.MetadataExtracterRegistry" >
-        <constructor-arg>
+        <property name="mimetypeMap">
             <ref bean="mimetypeService" />
-        </constructor-arg>
-        <!-- metadata extracters -->
-        <property name="extracters">
-            <list>
-                <bean class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" />
-                <bean class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" />
-                <bean class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" />
-                <bean class="org.alfresco.repo.content.metadata.StringMetadataExtracter" />
-                <bean class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" />
-                <bean class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" />
-                <bean class="org.alfresco.repo.content.metadata.UnoMetadataExtracter" >
-                    <constructor-arg>
-                        <ref bean="mimetypeService" />
-                    </constructor-arg>
-                </bean>
-            </list>
         </property>
     </bean>
 
+    <!-- Abstract bean definition defining base definition for all metadata extracters -->
+    <bean id="baseMetadataExtracter"
+            class="org.alfresco.repo.content.metadata.AbstractMetadataExtracter"
+            abstract="true"
+            init-method="register">
+        <property name="registry">
+            <ref bean="metadataExtracterRegistry" />
+        </property>
+    </bean>
+
+    <!-- Content Metadata Extracters -->
+    <bean class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
+    <bean class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
+    <bean class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
+    <bean class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" />
+    <bean class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" />
+    <bean class="org.alfresco.repo.content.metadata.UnoMetadataExtracter" parent="baseMetadataExtracter" init-method="init" >
+        <property name="mimetypeMap">
+            <ref bean="mimetypeService" />
+        </property>
+    </bean>
+
     <!-- Content Transformation Regisitry -->
     <bean id="contentTransformerRegistry" class="org.alfresco.repo.content.transform.ContentTransformerRegistry" >
@@ -116,9 +124,7 @@
         </constructor-arg>
     </bean>
 
-    <!--
-        Abstract bean definition defining base definition for all transformers
-    -->
+    <!-- Abstract bean definition defining base definition for all transformers -->
    <bean id="baseContentTransformer"
            class="org.alfresco.repo.content.transform.AbstractContentTransformer"
            abstract="true"

View File

@@ -21,7 +21,12 @@ import java.util.Collections;
 import java.util.Map;
 import java.util.Set;
 
+import org.alfresco.error.AlfrescoRuntimeException;
+import org.alfresco.service.cmr.repository.ContentIOException;
+import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.namespace.QName;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 
 /**
  *
@@ -29,28 +34,60 @@ import org.alfresco.service.namespace.QName;
  */
 abstract public class AbstractMetadataExtracter implements MetadataExtracter
 {
-    private Set<String> mimetypes;
+    private static Log logger = LogFactory.getLog(AbstractMetadataExtracter.class);
+
+    private MetadataExtracterRegistry registry;
+    private Set<String> supportedMimetypes;
     private double reliability;
     private long extractionTime;
 
-    protected AbstractMetadataExtracter(String mimetype, double reliability, long extractionTime)
+    protected AbstractMetadataExtracter(String supportedMimetype, double reliability, long extractionTime)
     {
-        this.mimetypes = Collections.singleton(mimetype);
+        this.supportedMimetypes = Collections.singleton(supportedMimetype);
         this.reliability = reliability;
         this.extractionTime = extractionTime;
     }
 
-    protected AbstractMetadataExtracter(Set<String> mimetypes, double reliability, long extractionTime)
+    protected AbstractMetadataExtracter(Set<String> supportedMimetypes, double reliability, long extractionTime)
     {
-        this.mimetypes = mimetypes;
+        this.supportedMimetypes = supportedMimetypes;
         this.reliability = reliability;
         this.extractionTime = extractionTime;
     }
 
-    public double getReliability(String sourceMimetype)
-    {
-        if (mimetypes.contains(sourceMimetype))
+    /**
+     * Set the registry to register with
+     *
+     * @param registry a metadata extracter registry
+     */
+    public void setRegistry(MetadataExtracterRegistry registry)
+    {
+        this.registry = registry;
+    }
+
+    /**
+     * Registers this instance of the extracter with the registry.
+     *
+     * @see #setRegistry(MetadataExtracterRegistry)
+     */
+    public void register()
+    {
+        if (registry == null)
+        {
+            throw new IllegalArgumentException("Property 'registry' has not been set");
+        }
+        registry.register(this);
+    }
+
+    /**
+     * Default reliability check that returns the reliability as configured by the contstructor
+     * if the mimetype is in the list of supported mimetypes.
+     *
+     * @param mimetype the mimetype to check
+     */
+    public double getReliability(String mimetype)
+    {
+        if (supportedMimetypes.contains(mimetype))
             return reliability;
         else
             return 0.0;
@@ -60,7 +97,69 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
     {
         return extractionTime;
     }
 
+    /**
+     * Checks if the mimetype is supported.
+     *
+     * @param reader the reader to check
+     * @throws AlfrescoRuntimeException if the mimetype is not supported
+     */
+    protected void checkReliability(ContentReader reader)
+    {
+        String mimetype = reader.getMimetype();
+        if (getReliability(mimetype) <= 0.0)
+        {
+            throw new AlfrescoRuntimeException(
+                    "Metadata extracter does not support mimetype: \n" +
+                    " reader: " + reader + "\n" +
+                    " supported: " + supportedMimetypes + "\n" +
+                    " extracter: " + this);
+        }
+    }
+
+    public final void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
+    {
+        // check the reliability
+        checkReliability(reader);
+
+        try
+        {
+            extractInternal(reader, destination);
+        }
+        catch (Throwable e)
+        {
+            throw new ContentIOException("Metadata extraction failed: \n" +
+                    " reader: " + reader + "\n" +
+                    e);
+        }
+        finally
+        {
+            // check that the reader and writer are both closed
+            if (!reader.isClosed())
+            {
+                logger.error("Content reader not closed by metadata extracter: \n" + reader);
+            }
+        }
+        // done
+        if (logger.isDebugEnabled())
+        {
+            logger.debug("Completed metadata extraction: \n" +
+                    " reader: " + reader + "\n" +
+                    " extracter: " + this);
+        }
+    }
+
+    /**
+     * Override to provide the necessary extraction logic.  Implementations must ensure that the reader
+     * is closed before the method exits.
+     *
+     * @param reader the source of the content
+     * @param destination the property map to fill
+     * @throws Throwable an exception
+     */
+    protected abstract void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable;
+
     /**
      * Examines a value or string for nulls and adds it to the map (if
      * non-empty)

View File

@@ -16,7 +16,6 @@
  */
 package org.alfresco.repo.content.metadata;
 
-import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
@@ -34,7 +33,6 @@ import javax.swing.text.html.parser.ParserDelegator;
 
 import org.alfresco.model.ContentModel;
 import org.alfresco.repo.content.MimetypeMap;
-import org.alfresco.service.cmr.repository.ContentIOException;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.namespace.QName;
@@ -56,122 +54,116 @@ public class HtmlMetadataExtracter extends AbstractMetadataExtracter
         super(MIMETYPES, 1.0, 1000);
     }
 
-    public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
+    public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
     {
         final Map<QName, Serializable> tempDestination = new HashMap<QName, Serializable>();
-        try
-        {
+
         HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback()
         {
             StringBuffer title = null;
             boolean inHead = false;
 
             public void handleText(char[] data, int pos)
             {
                 if (title != null)
                 {
                     title.append(data);
                 }
             }
 
             public void handleComment(char[] data, int pos)
             {
                 // Perhaps sniff for Office 9+ metadata in here?
             }
 
             public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
             {
                 if (HTML.Tag.HEAD.equals(t))
                 {
                     inHead = true;
                 }
                 else if (HTML.Tag.TITLE.equals(t) && inHead)
                 {
                     title = new StringBuffer();
                 }
                 else
                     handleSimpleTag(t, a, pos);
             }
 
             public void handleEndTag(HTML.Tag t, int pos)
             {
                 if (HTML.Tag.HEAD.equals(t))
                 {
                     inHead = false;
                 }
                 else if (HTML.Tag.TITLE.equals(t) && title != null)
                 {
                     trimPut(ContentModel.PROP_TITLE, title.toString(), tempDestination);
                     title = null;
                 }
             }
 
             public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
             {
                 if (HTML.Tag.META.equals(t))
                 {
                     Object nameO = a.getAttribute(HTML.Attribute.NAME);
                     Object valueO = a.getAttribute(HTML.Attribute.CONTENT);
                     if (nameO == null || valueO == null)
                         return;
                     String name = nameO.toString();
 
                     if (name.equalsIgnoreCase("creator") || name.equalsIgnoreCase("author")
                             || name.equalsIgnoreCase("dc.creator"))
                     {
                         trimPut(ContentModel.PROP_AUTHOR, valueO, tempDestination);
                     }
                     if (name.equalsIgnoreCase("description") || name.equalsIgnoreCase("dc.description"))
                     {
                         trimPut(ContentModel.PROP_DESCRIPTION, valueO, tempDestination);
                     }
                 }
             }
 
             public void handleError(String errorMsg, int pos)
             {
             }
         };
 
         String charsetGuess = "UTF-8";
         int tries = 0;
         while (tries < 3)
         {
             tempDestination.clear();
             Reader r = null;
             InputStream cis = null;
             try
             {
                 cis = reader.getContentInputStream();
                 // TODO: for now, use default charset; we should attempt to map from html meta-data
                 r = new InputStreamReader(cis);
                 HTMLEditorKit.Parser parser = new ParserDelegator();
                 parser.parse(r, callback, tries > 0);
                 destination.putAll(tempDestination);
                 break;
             }
             catch (ChangedCharSetException ccse)
             {
                 tries++;
                 charsetGuess = ccse.getCharSetSpec();
                 int begin = charsetGuess.indexOf("charset=");
                 if (begin > 0)
                     charsetGuess = charsetGuess.substring(begin + 8, charsetGuess.length());
                 reader = reader.getReader();
             }
             finally
             {
                 if (r != null)
                     r.close();
                 if (cis != null)
                     cis.close();
             }
         }
-        }
-        catch (IOException e)
-        {
-            throw new ContentIOException("HTML metadata extraction failed: \n" + " reader: " + reader, e);
-        }
     }
 }

View File

@@ -17,16 +17,12 @@
  */
 package org.alfresco.repo.content.metadata;
 
 import org.alfresco.repo.content.MimetypeMap;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 
 /**
- * @see org.alfresco.repo.content.transform.OfficeMetadataExtracter
  * @author Jesper Steen Møller
  */
 public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest
 {
-    private static final Log logger = LogFactory.getLog(HtmlMetadataExtracterTest.class);
     private MetadataExtracter extracter;
 
     public void onSetUpInTransaction() throws Exception

View File

@@ -17,20 +17,17 @@
  */
 package org.alfresco.repo.content.metadata;
 
 import java.io.File;
-import java.io.IOException;
 import java.io.Serializable;
 import java.util.HashMap;
 import java.util.Map;
 import org.alfresco.model.ContentModel;
 import org.alfresco.repo.content.MimetypeMap;
-import org.alfresco.service.cmr.repository.ContentIOException;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.namespace.QName;
-import org.alfresco.util.GUID;
+import org.alfresco.util.TempFileProvider;
 import org.farng.mp3.AbstractMP3FragmentBody;
 import org.farng.mp3.MP3File;
-import org.farng.mp3.TagException;
 import org.farng.mp3.id3.AbstractID3v2;
 import org.farng.mp3.id3.AbstractID3v2Frame;
 import org.farng.mp3.id3.ID3v1;
@@ -58,103 +55,88 @@ public class MP3MetadataExtracter extends AbstractMetadataExtracter
         super(MimetypeMap.MIMETYPE_MP3, 1.0, 1000);
     }
 
-    /**
-     * @see org.alfresco.repo.content.metadata.MetadataExtracter#extract(org.alfresco.service.cmr.repository.ContentReader, java.util.Map)
-     */
-    public void extract(ContentReader reader,
-            Map<QName, Serializable> destination) throws ContentIOException
+    public void extractInternal(
+            ContentReader reader,
+            Map<QName, Serializable> destination) throws Throwable
     {
-        try
-        {
         Map<QName, Serializable> props = new HashMap<QName, Serializable>();
 
         // Create a temp file
-        File tempFile = File.createTempFile(GUID.generate(), ".tmp");
+        File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp");
         try
         {
             reader.getContent(tempFile);
 
             // Create the MP3 object from the file
             MP3File mp3File = new MP3File(tempFile);
 
             ID3v1 id3v1 = mp3File.getID3v1Tag();
             if (id3v1 != null)
             {
                 setTagValue(props, PROP_ALBUM_TITLE, id3v1.getAlbum());
                 setTagValue(props, PROP_SONG_TITLE, id3v1.getTitle());
                 setTagValue(props, PROP_ARTIST, id3v1.getArtist());
                 setTagValue(props, PROP_COMMENT, id3v1.getComment());
                 setTagValue(props, PROP_YEAR_RELEASED, id3v1.getYear());
 
                 // TODO sort out the genre
                 //setTagValue(props, MusicModel.PROP_GENRE, id3v1.getGenre());
 
                 // TODO sort out the size
                 //setTagValue(props, MusicModel.PROP_SIZE, id3v1.getSize());
             }
 
             AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
             if (id3v2 != null)
             {
                 setTagValue(props, PROP_SONG_TITLE, getID3V2Value(id3v2, "TIT2"));
                 setTagValue(props, PROP_ARTIST, getID3V2Value(id3v2, "TPE1"));
                 setTagValue(props, PROP_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"));
                 setTagValue(props, PROP_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"));
                 setTagValue(props, PROP_COMMENT, getID3V2Value(id3v2, "COMM"));
                 setTagValue(props, PROP_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"));
                 setTagValue(props, PROP_GENRE, getID3V2Value(id3v2, "TCON"));
                 setTagValue(props, PROP_COMPOSER, getID3V2Value(id3v2, "TCOM"));
 
                 // TODO sort out the lyrics
                 //System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
                 //System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
             }
 
             AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
             if (lyrics3Tag != null)
             {
                 System.out.println("Lyrics3 tag found.");
                 if (lyrics3Tag instanceof Lyrics3v2)
                 {
                     setTagValue(props, PROP_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"));
                     setTagValue(props, PROP_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"));
                     setTagValue(props, PROP_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"));
                     setTagValue(props, PROP_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"));
                     setTagValue(props, PROP_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"));
                     setTagValue(props, PROP_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"));
                 }
             }
         }
         finally
         {
             tempFile.delete();
         }
 
         // Set the destination values
         if (props.get(PROP_SONG_TITLE) != null)
         {
             destination.put(ContentModel.PROP_TITLE, props.get(PROP_SONG_TITLE));
         }
         if (props.get(PROP_ARTIST) != null)
         {
             destination.put(ContentModel.PROP_AUTHOR, props.get(PROP_ARTIST));
         }
         String description = getDescription(props);
         if (description != null)
         {
             destination.put(ContentModel.PROP_DESCRIPTION, description);
         }
-        }
-        catch (IOException ioException)
-        {
-            // TODO sort out exception handling
-            throw new RuntimeException("Error reading mp3 file.", ioException);
-        }
-        catch (TagException tagException)
-        {
-            // TODO sort out exception handling
-            throw new RuntimeException("Error reading mp3 tag information.", tagException);
-        }
     }

View File

@@ -16,7 +16,7 @@
  */
 package org.alfresco.repo.content.metadata;
 
-import java.util.Collections;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -28,7 +28,6 @@ import org.alfresco.error.AlfrescoRuntimeException;
 import org.alfresco.repo.content.MimetypeMap;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.springframework.util.Assert;
 
 /**
  * Holds and provides the most appropriate metadate extracter for a particular
@@ -52,15 +51,10 @@ public class MetadataExtracterRegistry
     /** controls write access to the cache */
     private Lock extracterCacheWriteLock;
 
-    /**
-     * @param mimetypeMap all the mimetypes available to the system
-     */
-    public MetadataExtracterRegistry(MimetypeMap mimetypeMap)
+    public MetadataExtracterRegistry()
     {
-        Assert.notNull(mimetypeMap, "The MimetypeMap is mandatory");
-        this.mimetypeMap = mimetypeMap;
-        extracters = Collections.emptyList(); // just in case it isn't set
+        // initialise lists
+        extracters = new ArrayList<MetadataExtracter>(10);
         extracterCache = new HashMap<String, MetadataExtracter>(17);
 
         // create lock objects for access to the cache
@@ -69,6 +63,40 @@
         extracterCacheWriteLock = extractionCacheLock.writeLock();
     }
 
+    /**
+     * The mimetype map that will be used to check requests against
+     *
+     * @param mimetypeMap a map of mimetypes
+     */
+    public void setMimetypeMap(MimetypeMap mimetypeMap)
+    {
+        this.mimetypeMap = mimetypeMap;
+    }
+
+    /**
+     * Register an instance of an extracter for use
+     *
+     * @param extracter an extracter
+     */
+    public void register(MetadataExtracter extracter)
+    {
+        if (logger.isDebugEnabled())
+        {
+            logger.debug("Registering metadata extracter: " + extracter);
+        }
+
+        extracterCacheWriteLock.lock();
+        try
+        {
+            extracters.add(extracter);
+            extracterCache.clear();
+        }
+        finally
+        {
+            extracterCacheWriteLock.unlock();
+        }
+    }
+
     /**
      * Gets the best metadata extracter. This is a combination of the most
      * reliable and the most performant extracter.
@@ -123,8 +151,8 @@
     /**
      * @param sourceMimetype The MIME type under examination
-     * @return The fastest of the most reliable extracters in
-     *         <code>extracters</code> for the given MIME type.
+     * @return The fastest of the most reliable extracters in <code>extracters</code>
+     *         for the given MIME type, or null if none is available.
      */
     private MetadataExtracter findBestExtracter(String sourceMimetype)
     {
@@ -137,7 +165,12 @@
         for (MetadataExtracter ext : extracters)
         {
             double r = ext.getReliability(sourceMimetype);
-            if (r == bestReliability)
+            if (r <= 0.0)
+            {
+                // extraction not achievable
+                continue;
+            }
+            else if (r == bestReliability)
             {
                 long time = ext.getExtractionTime();
                 if (time < bestTime)
} }
return bestExtracter; return bestExtracter;
} }
/**
* Provides a list of self-discovering extracters.
*
* @param transformers all the available extracters that the registry can
* work with
*/
public void setExtracters(List<MetadataExtracter> extracters)
{
logger.debug("Setting " + extracters.size() + "new extracters.");
extracterCacheWriteLock.lock();
try
{
this.extracters = extracters;
this.extracterCache.clear();
}
finally
{
extracterCacheWriteLock.unlock();
}
}
} }

View File

@@ -50,7 +50,7 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
         super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.0, 1000);
     }
 
-    public void extract(ContentReader reader, final Map<QName, Serializable> destination) throws ContentIOException
+    public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
     {
         POIFSReaderListener readerListener = new POIFSReaderListener()
         {
@@ -96,12 +96,6 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
             poiFSReader.registerListener(readerListener, SummaryInformation.DEFAULT_STREAM_NAME);
             poiFSReader.read(is);
         }
-        catch (IOException e)
-        {
-            throw new ContentIOException("Compound Document SummaryInformation metadata extraction failed: \n"
-                    + " reader: " + reader,
-                    e);
-        }
         finally
         {
             if (is != null)

View File

@@ -16,6 +16,8 @@
  */
 package org.alfresco.repo.content.metadata;
 
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.Serializable;
 import java.util.Arrays;
 import java.util.HashSet;
@@ -23,11 +25,8 @@ import java.util.Map;
 
 import org.alfresco.model.ContentModel;
 import org.alfresco.repo.content.MimetypeMap;
-import org.alfresco.service.cmr.repository.ContentIOException;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.namespace.QName;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 
 import com.catcode.odf.ODFMetaFileAnalyzer;
 import com.catcode.odf.OpenDocumentMetadata;
@@ -41,8 +40,6 @@ import com.catcode.odf.OpenDocumentMetadata;
  */
 public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
 {
-    private static final Log logger = LogFactory.getLog(OpenDocumentMetadataExtracter.class);
-
     private static String[] mimeTypes = new String[] {
             MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT,
             MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE,
@@ -67,13 +64,15 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
         super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 1000);
     }
 
-    public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
+    public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
     {
         ODFMetaFileAnalyzer analyzer = new ODFMetaFileAnalyzer();
+        InputStream is = null;
         try
         {
+            is = reader.getContentInputStream();
             // stream the document in
-            OpenDocumentMetadata docInfo = analyzer.analyzeZip(reader.getContentInputStream());
+            OpenDocumentMetadata docInfo = analyzer.analyzeZip(is);
 
             if (docInfo != null)
             {
@@ -84,12 +83,12 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
                 destination.put(ContentModel.PROP_CREATED, docInfo.getCreationDate());
             }
         }
-        catch (Throwable e)
+        finally
         {
-            String message = "Metadata extraction failed: \n" +
-                    " reader: " + reader;
-            logger.debug(message, e);
-            throw new ContentIOException(message, e);
+            if (is != null)
+            {
+                try { is.close(); } catch (IOException e) {}
+            }
         }
     }
 }

View File

@@ -17,17 +17,15 @@
  */
 package org.alfresco.repo.content.metadata;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.Serializable;
 import java.util.Calendar;
 import java.util.Map;
 
 import org.alfresco.model.ContentModel;
 import org.alfresco.repo.content.MimetypeMap;
-import org.alfresco.service.cmr.repository.ContentIOException;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.namespace.QName;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.pdfbox.pdmodel.PDDocument;
 import org.pdfbox.pdmodel.PDDocumentInformation;
@@ -37,26 +35,20 @@ import org.pdfbox.pdmodel.PDDocumentInformation;
  */
 public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter
 {
-    private static final Log logger = LogFactory.getLog(PdfBoxMetadataExtracter.class);
-
     public PdfBoxMetadataExtracter()
     {
         super(MimetypeMap.MIMETYPE_PDF, 1.0, 1000);
     }
 
-    public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
+    public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
     {
-        if (!MimetypeMap.MIMETYPE_PDF.equals(reader.getMimetype()))
-        {
-            logger.debug("No metadata extracted for " + reader.getMimetype());
-            return;
-        }
-
         PDDocument pdf = null;
+        InputStream is = null;
         try
         {
+            is = reader.getContentInputStream();
             // stream the document in
-            pdf = PDDocument.load(reader.getContentInputStream());
+            pdf = PDDocument.load(is);
 
             // Scoop out the metadata
             PDDocumentInformation docInfo = pdf.getDocumentInformation();
@@ -68,23 +60,15 @@ public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter
             if (created != null)
                 destination.put(ContentModel.PROP_CREATED, created.getTime());
         }
-        catch (IOException e)
-        {
-            throw new ContentIOException("PDF metadata extraction failed: \n" +
-                    " reader: " + reader);
-        }
         finally
         {
+            if (is != null)
+            {
+                try { is.close(); } catch (IOException e) {}
+            }
             if (pdf != null)
             {
-                try
-                {
-                    pdf.close();
-                }
-                catch (Throwable e)
-                {
-                    e.printStackTrace();
-                }
+                try { pdf.close(); } catch (Throwable e) { e.printStackTrace(); }
             }
         }
     }

View File

@@ -1,16 +1,14 @@
 package org.alfresco.repo.content.metadata;
 
 import org.alfresco.repo.content.MimetypeMap;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 
 /**
- * @see org.alfresco.repo.content.transform.PdfBoxContentTransformer
- *
+ * @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter
  * @author Jesper Steen Møller
 */
 public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
 {
-    private static final Log logger = LogFactory.getLog(PdfBoxMetadataExtracterTest.class);
     private MetadataExtracter extracter;
 
     public void onSetUpInTransaction() throws Exception

View File

@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2005 Jesper Steen Møller
- *
- * Licensed under the Mozilla Public License version 1.1
- * with a permitted attribution clause. You may obtain a
- * copy of the License at
- *
- *   http://www.alfresco.org/legal/license.txt
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
- * either express or implied. See the License for the specific
- * language governing permissions and limitations under the
- * License.
- */
-package org.alfresco.repo.content.metadata;
-
-import java.io.Serializable;
-import java.util.Map;
-
-import org.alfresco.service.cmr.repository.ContentIOException;
-import org.alfresco.service.cmr.repository.ContentReader;
-import org.alfresco.service.namespace.QName;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-/**
- *
- * @author Jesper Steen Møller
- */
-public class StringMetadataExtracter implements MetadataExtracter
-{
-    public static final String PREFIX_TEXT = "text/";
-
-    private static final Log logger = LogFactory.getLog(StringMetadataExtracter.class);
-
-    public double getReliability(String sourceMimetype)
-    {
-        if (sourceMimetype.startsWith(PREFIX_TEXT))
-            return 0.1;
-        else
-            return 0.0;
-    }
-
-    public long getExtractionTime()
-    {
-        return 1000;
-    }
-
-    public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
-    {
-        if (logger.isDebugEnabled())
-        {
-            logger.debug("No metadata extracted for " + reader.getMimetype());
-        }
-    }
-}

View File

@@ -28,12 +28,9 @@ import net.sf.joott.uno.UnoConnection;
 
 import org.alfresco.model.ContentModel;
 import org.alfresco.repo.content.MimetypeMap;
-import org.alfresco.service.cmr.repository.ContentIOException;
 import org.alfresco.service.cmr.repository.ContentReader;
 import org.alfresco.service.namespace.QName;
 import org.alfresco.util.TempFileProvider;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 
 import com.sun.star.beans.PropertyValue;
 import com.sun.star.beans.XPropertySet;
@@ -49,9 +46,6 @@ import com.sun.star.uno.UnoRuntime;
  */
 public class UnoMetadataExtracter extends AbstractMetadataExtracter
 {
-    private static final Log logger = LogFactory.getLog(UnoMetadataExtracter.class);
-
     private static String[] mimeTypes = new String[] {
             MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT,
             MimetypeMap.MIMETYPE_OPENOFFICE1_WRITER,
@@ -60,33 +54,44 @@ public class UnoMetadataExtracter extends AbstractMetadataExtracter
             // quality since they involve conversion.
     };
 
-    public UnoMetadataExtracter(MimetypeMap mimetypeMap, String connectionUrl)
-    {
-        super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 10000);
-        this.mimetypeMap = mimetypeMap;
-        init(connectionUrl);
-    }
-
-    public UnoMetadataExtracter(MimetypeMap mimetypeMap)
-    {
-        this(mimetypeMap, UnoConnection.DEFAULT_CONNECTION_STRING);
-    }
-
     private MimetypeMap mimetypeMap;
+    private String contentUrl;
     private MyUnoConnection connection;
     private boolean isConnected;
 
+    public UnoMetadataExtracter()
+    {
+        super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 10000);
+        this.contentUrl = UnoConnection.DEFAULT_CONNECTION_STRING;
+    }
+
+    public void setMimetypeMap(MimetypeMap mimetypeMap)
+    {
+        this.mimetypeMap = mimetypeMap;
+    }
+
     /**
-     * @param unoConnectionUrl the URL of the Uno server
+     *
+     * @param contentUrl the URL to connect to
      */
-    private synchronized void init(String unoConnectionUrl)
+    public void setContentUrl(String contentUrl)
     {
-        connection = new MyUnoConnection(unoConnectionUrl);
+        this.contentUrl = contentUrl;
+    }
+
+    /**
+     * Initialises the bean by establishing an UNO connection
+     */
+    public synchronized void init()
+    {
+        connection = new MyUnoConnection(contentUrl);
         // attempt to make an connection
         try
         {
             connection.connect();
             isConnected = true;
+            // register
+            super.register();
         }
         catch (ConnectException e)
         {
@@ -103,66 +108,58 @@
         return isConnected;
     }
 
-    public void extract(ContentReader reader, final Map<QName, Serializable> destination) throws ContentIOException
+    public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
     {
         String sourceMimetype = reader.getMimetype();
 
         // create temporary files to convert from and to
-        File tempFromFile = TempFileProvider.createTempFile("UnoContentTransformer", "."
+        File tempFromFile = TempFileProvider.createTempFile(
+                "UnoContentTransformer_", "."
                 + mimetypeMap.getExtension(sourceMimetype));
         // download the content from the source reader
         reader.getContent(tempFromFile);
-        String sourceUrl = tempFromFile.toString();
-        try
-        {
-            sourceUrl = toUrl(tempFromFile, connection);
+
+        String sourceUrl = toUrl(tempFromFile, connection);
 
         // UNO Interprocess Bridge *should* be thread-safe, but...
         synchronized (connection)
         {
             XComponentLoader desktop = connection.getDesktop();
             XComponent document = desktop.loadComponentFromURL(
                     sourceUrl,
                     "_blank",
                     0,
                     new PropertyValue[] { property("Hidden", Boolean.TRUE) });
             if (document == null)
             {
                 throw new FileNotFoundException("could not open source document: " + sourceUrl);
             }
             try
             {
                 XDocumentInfoSupplier infoSupplier = (XDocumentInfoSupplier) UnoRuntime.queryInterface(
                         XDocumentInfoSupplier.class, document);
                 XPropertySet propSet = (XPropertySet) UnoRuntime.queryInterface(
                         XPropertySet.class,
                         infoSupplier
                                 .getDocumentInfo());
 
                 // Titled aspect
                 trimPut(ContentModel.PROP_TITLE, propSet.getPropertyValue("Title"), destination);
                 trimPut(ContentModel.PROP_DESCRIPTION, propSet.getPropertyValue("Subject"), destination);
 
                 // Auditable aspect
                 // trimPut(ContentModel.PROP_CREATED,
                 // si.getCreateDateTime(), destination);
                 trimPut(ContentModel.PROP_AUTHOR, propSet.getPropertyValue("Author"), destination);
                 // trimPut(ContentModel.PROP_MODIFIED,
                 // si.getLastSaveDateTime(), destination);
                 // trimPut(ContentModel.PROP_MODIFIER, si.getLastAuthor(),
                 // destination);
             }
             finally
             {
                 document.dispose();
             }
         }
-        }
-        catch (Throwable e)
-        {
-            throw new ContentIOException("Conversion failed: \n" +
-                    " source: " + sourceUrl + "\n",
-                    e);
-        }
     }

View File

@@ -19,7 +19,6 @@ package org.alfresco.repo.content.metadata;
 import org.alfresco.repo.content.MimetypeMap;
 
 /**
- * @see org.alfresco.repo.content.transform.UnoMetadataExtracter
 * @author Jesper Steen Møller
 */
 public class UnoMetadataExtracterTest extends AbstractMetadataExtracterTest
@@ -28,7 +27,8 @@ public class UnoMetadataExtracterTest extends AbstractMetadataExtracterTest
 
     public void onSetUpInTransaction() throws Exception
     {
-        extracter = new UnoMetadataExtracter(mimetypeMap);
+        extracter = new UnoMetadataExtracter();
+        extracter.setMimetypeMap(mimetypeMap);
     }
 
     /**