From 31d9ef768b88e08943921092ce6949d0a001e2d4 Mon Sep 17 00:00:00 2001 From: Derek Hulley Date: Wed, 22 Feb 2006 11:11:53 +0000 Subject: [PATCH] Inverted configuration of Metadata Extracters - Adding an extracter no longer requires modification to the MetadataExtracterRegistry Fixed lack of stream closures git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@2465 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261 --- config/alfresco/content-services-context.xml | 46 ++-- .../metadata/AbstractMetadataExtracter.java | 115 +++++++++- .../metadata/HtmlMetadataExtracter.java | 212 +++++++++--------- .../metadata/HtmlMetadataExtracterTest.java | 4 - .../metadata/MP3MetadataExtracter.java | 154 ++++++------- .../metadata/MetadataExtracterRegistry.java | 81 ++++--- .../metadata/OfficeMetadataExtracter.java | 8 +- .../OpenDocumentMetadataExtracter.java | 23 +- .../metadata/PdfBoxMetadataExtracter.java | 36 +-- .../metadata/PdfBoxMetadataExtracterTest.java | 6 +- .../metadata/StringMetadataExtracter.java | 58 ----- .../metadata/UnoMetadataExtracter.java | 143 ++++++------ .../metadata/UnoMetadataExtracterTest.java | 4 +- 13 files changed, 445 insertions(+), 445 deletions(-) delete mode 100644 source/java/org/alfresco/repo/content/metadata/StringMetadataExtracter.java diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 7a97118f04..7fd5b8840f 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -87,27 +87,35 @@ + - + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + @@ -116,9 +124,7 @@ - + mimetypes; + private static Log logger = LogFactory.getLog(AbstractMetadataExtracter.class); + + private MetadataExtracterRegistry registry; + private Set supportedMimetypes; private double reliability; private long extractionTime; - protected AbstractMetadataExtracter(String mimetype, double reliability, long extractionTime) + protected 
AbstractMetadataExtracter(String supportedMimetype, double reliability, long extractionTime) { - this.mimetypes = Collections.singleton(mimetype); + this.supportedMimetypes = Collections.singleton(supportedMimetype); this.reliability = reliability; this.extractionTime = extractionTime; } - protected AbstractMetadataExtracter(Set mimetypes, double reliability, long extractionTime) + protected AbstractMetadataExtracter(Set supportedMimetypes, double reliability, long extractionTime) { - this.mimetypes = mimetypes; + this.supportedMimetypes = supportedMimetypes; this.reliability = reliability; this.extractionTime = extractionTime; } - public double getReliability(String sourceMimetype) + /** + * Set the registry to register with + * + * @param registry a metadata extracter registry + */ + public void setRegistry(MetadataExtracterRegistry registry) { - if (mimetypes.contains(sourceMimetype)) + this.registry = registry; + } + + /** + * Registers this instance of the extracter with the registry. + * + * @see #setRegistry(MetadataExtracterRegistry) + */ + public void register() + { + if (registry == null) + { + throw new IllegalArgumentException("Property 'registry' has not been set"); + } + registry.register(this); + } + + /** + * Default reliability check that returns the reliability as configured by the constructor + * if the mimetype is in the list of supported mimetypes. + * + * @param mimetype the mimetype to check + */ + public double getReliability(String mimetype) + { + if (supportedMimetypes.contains(mimetype)) return reliability; else return 0.0; @@ -60,7 +97,69 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter { return extractionTime; } + + /** + * Checks if the mimetype is supported. 
+ * + * @param reader the reader to check + * @throws AlfrescoRuntimeException if the mimetype is not supported + */ + protected void checkReliability(ContentReader reader) + { + String mimetype = reader.getMimetype(); + if (getReliability(mimetype) <= 0.0) + { + throw new AlfrescoRuntimeException( + "Metadata extracter does not support mimetype: \n" + + " reader: " + reader + "\n" + + " supported: " + supportedMimetypes + "\n" + + " extracter: " + this); + } + } + public final void extract(ContentReader reader, Map destination) throws ContentIOException + { + // check the reliability + checkReliability(reader); + + try + { + extractInternal(reader, destination); + } + catch (Throwable e) + { + throw new ContentIOException("Metadata extraction failed: \n" + + " reader: " + reader + "\n" + + e); + } + finally + { + // check that the reader and writer are both closed + if (!reader.isClosed()) + { + logger.error("Content reader not closed by metadata extracter: \n" + reader); + } + } + + // done + if (logger.isDebugEnabled()) + { + logger.debug("Completed metadata extraction: \n" + + " reader: " + reader + "\n" + + " extracter: " + this); + } + } + + /** + * Override to provide the necessary extraction logic. Implementations must ensure that the reader + * is closed before the method exits. 
+ * + * @param reader the source of the content + * @param destination the property map to fill + * @throws Throwable an exception + */ + protected abstract void extractInternal(ContentReader reader, Map destination) throws Throwable; + /** * Examines a value or string for nulls and adds it to the map (if * non-empty) diff --git a/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracter.java index 84021d805f..63b731e3c2 100644 --- a/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracter.java @@ -16,7 +16,6 @@ */ package org.alfresco.repo.content.metadata; -import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; @@ -34,7 +33,6 @@ import javax.swing.text.html.parser.ParserDelegator; import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; @@ -56,122 +54,116 @@ public class HtmlMetadataExtracter extends AbstractMetadataExtracter super(MIMETYPES, 1.0, 1000); } - public void extract(ContentReader reader, Map destination) throws ContentIOException + public void extractInternal(ContentReader reader, Map destination) throws Throwable { final Map tempDestination = new HashMap(); - try + + HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() { - HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() + StringBuffer title = null; + boolean inHead = false; + + public void handleText(char[] data, int pos) { - StringBuffer title = null; - boolean inHead = false; - - public void handleText(char[] data, int pos) + if (title != null) { - if (title != null) - { - title.append(data); - } - } - - public void 
handleComment(char[] data, int pos) - { - // Perhaps sniff for Office 9+ metadata in here? - } - - public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) - { - if (HTML.Tag.HEAD.equals(t)) - { - inHead = true; - } - else if (HTML.Tag.TITLE.equals(t) && inHead) - { - title = new StringBuffer(); - } - else - handleSimpleTag(t, a, pos); - } - - public void handleEndTag(HTML.Tag t, int pos) - { - if (HTML.Tag.HEAD.equals(t)) - { - inHead = false; - } - else if (HTML.Tag.TITLE.equals(t) && title != null) - { - trimPut(ContentModel.PROP_TITLE, title.toString(), tempDestination); - title = null; - } - } - - public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) - { - if (HTML.Tag.META.equals(t)) - { - Object nameO = a.getAttribute(HTML.Attribute.NAME); - Object valueO = a.getAttribute(HTML.Attribute.CONTENT); - if (nameO == null || valueO == null) - return; - - String name = nameO.toString(); - - if (name.equalsIgnoreCase("creator") || name.equalsIgnoreCase("author") - || name.equalsIgnoreCase("dc.creator")) - { - trimPut(ContentModel.PROP_AUTHOR, valueO, tempDestination); - } - if (name.equalsIgnoreCase("description") || name.equalsIgnoreCase("dc.description")) - { - trimPut(ContentModel.PROP_DESCRIPTION, valueO, tempDestination); - } - } - } - - public void handleError(String errorMsg, int pos) - { - } - }; - - String charsetGuess = "UTF-8"; - int tries = 0; - while (tries < 3) - { - tempDestination.clear(); - Reader r = null; - InputStream cis = null; - try - { - cis = reader.getContentInputStream(); - // TODO: for now, use default charset; we should attempt to map from html meta-data - r = new InputStreamReader(cis); - HTMLEditorKit.Parser parser = new ParserDelegator(); - parser.parse(r, callback, tries > 0); - destination.putAll(tempDestination); - break; - } - catch (ChangedCharSetException ccse) - { - tries++; - charsetGuess = ccse.getCharSetSpec(); - int begin = charsetGuess.indexOf("charset="); - if (begin > 0) - charsetGuess = 
charsetGuess.substring(begin + 8, charsetGuess.length()); - reader = reader.getReader(); - } - finally - { - if (r != null) - r.close(); - if (cis != null) - cis.close(); + title.append(data); } } - } - catch (IOException e) + + public void handleComment(char[] data, int pos) + { + // Perhaps sniff for Office 9+ metadata in here? + } + + public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) + { + if (HTML.Tag.HEAD.equals(t)) + { + inHead = true; + } + else if (HTML.Tag.TITLE.equals(t) && inHead) + { + title = new StringBuffer(); + } + else + handleSimpleTag(t, a, pos); + } + + public void handleEndTag(HTML.Tag t, int pos) + { + if (HTML.Tag.HEAD.equals(t)) + { + inHead = false; + } + else if (HTML.Tag.TITLE.equals(t) && title != null) + { + trimPut(ContentModel.PROP_TITLE, title.toString(), tempDestination); + title = null; + } + } + + public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) + { + if (HTML.Tag.META.equals(t)) + { + Object nameO = a.getAttribute(HTML.Attribute.NAME); + Object valueO = a.getAttribute(HTML.Attribute.CONTENT); + if (nameO == null || valueO == null) + return; + + String name = nameO.toString(); + + if (name.equalsIgnoreCase("creator") || name.equalsIgnoreCase("author") + || name.equalsIgnoreCase("dc.creator")) + { + trimPut(ContentModel.PROP_AUTHOR, valueO, tempDestination); + } + if (name.equalsIgnoreCase("description") || name.equalsIgnoreCase("dc.description")) + { + trimPut(ContentModel.PROP_DESCRIPTION, valueO, tempDestination); + } + } + } + + public void handleError(String errorMsg, int pos) + { + } + }; + + String charsetGuess = "UTF-8"; + int tries = 0; + while (tries < 3) { - throw new ContentIOException("HTML metadata extraction failed: \n" + " reader: " + reader, e); + tempDestination.clear(); + Reader r = null; + InputStream cis = null; + try + { + cis = reader.getContentInputStream(); + // TODO: for now, use default charset; we should attempt to map from html meta-data + r = new 
InputStreamReader(cis); + HTMLEditorKit.Parser parser = new ParserDelegator(); + parser.parse(r, callback, tries > 0); + destination.putAll(tempDestination); + break; + } + catch (ChangedCharSetException ccse) + { + tries++; + charsetGuess = ccse.getCharSetSpec(); + int begin = charsetGuess.indexOf("charset="); + if (begin > 0) + charsetGuess = charsetGuess.substring(begin + 8, charsetGuess.length()); + reader = reader.getReader(); + } + finally + { + if (r != null) + r.close(); + if (cis != null) + cis.close(); + } } } } diff --git a/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracterTest.java index 49acfb2026..39f627d3b0 100644 --- a/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/HtmlMetadataExtracterTest.java @@ -17,16 +17,12 @@ package org.alfresco.repo.content.metadata; import org.alfresco.repo.content.MimetypeMap; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; /** - * @see org.alfresco.repo.content.transform.OfficeMetadataExtracter * @author Jesper Steen Møller */ public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest { - private static final Log logger = LogFactory.getLog(HtmlMetadataExtracterTest.class); private MetadataExtracter extracter; public void onSetUpInTransaction() throws Exception diff --git a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java index e0ced6bb33..e24e6bd19c 100644 --- a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java @@ -17,20 +17,17 @@ package org.alfresco.repo.content.metadata; import java.io.File; -import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import 
java.util.Map; import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; -import org.alfresco.util.GUID; +import org.alfresco.util.TempFileProvider; import org.farng.mp3.AbstractMP3FragmentBody; import org.farng.mp3.MP3File; -import org.farng.mp3.TagException; import org.farng.mp3.id3.AbstractID3v2; import org.farng.mp3.id3.AbstractID3v2Frame; import org.farng.mp3.id3.ID3v1; @@ -58,103 +55,88 @@ public class MP3MetadataExtracter extends AbstractMetadataExtracter super(MimetypeMap.MIMETYPE_MP3, 1.0, 1000); } - /** - * @see org.alfresco.repo.content.metadata.MetadataExtracter#extract(org.alfresco.service.cmr.repository.ContentReader, java.util.Map) - */ - public void extract(ContentReader reader, - Map destination) throws ContentIOException + public void extractInternal( + ContentReader reader, + Map destination) throws Throwable { + Map props = new HashMap(); + + // Create a temp file + File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp"); try { - Map props = new HashMap(); + reader.getContent(tempFile); - // Create a temp file - File tempFile = File.createTempFile(GUID.generate(), ".tmp"); - try + // Create the MP3 object from the file + MP3File mp3File = new MP3File(tempFile); + + ID3v1 id3v1 = mp3File.getID3v1Tag(); + if (id3v1 != null) { - reader.getContent(tempFile); + setTagValue(props, PROP_ALBUM_TITLE, id3v1.getAlbum()); + setTagValue(props, PROP_SONG_TITLE, id3v1.getTitle()); + setTagValue(props, PROP_ARTIST, id3v1.getArtist()); + setTagValue(props, PROP_COMMENT, id3v1.getComment()); + setTagValue(props, PROP_YEAR_RELEASED, id3v1.getYear()); - // Create the MP3 object from the file - MP3File mp3File = new MP3File(tempFile); + // TODO sort out the genre + //setTagValue(props, MusicModel.PROP_GENRE, id3v1.getGenre()); - ID3v1 id3v1 = 
mp3File.getID3v1Tag(); - if (id3v1 != null) - { - setTagValue(props, PROP_ALBUM_TITLE, id3v1.getAlbum()); - setTagValue(props, PROP_SONG_TITLE, id3v1.getTitle()); - setTagValue(props, PROP_ARTIST, id3v1.getArtist()); - setTagValue(props, PROP_COMMENT, id3v1.getComment()); - setTagValue(props, PROP_YEAR_RELEASED, id3v1.getYear()); - - // TODO sort out the genre - //setTagValue(props, MusicModel.PROP_GENRE, id3v1.getGenre()); - - // TODO sort out the size - //setTagValue(props, MusicModel.PROP_SIZE, id3v1.getSize()); - } - - AbstractID3v2 id3v2 = mp3File.getID3v2Tag(); - if (id3v2 != null) - { - setTagValue(props, PROP_SONG_TITLE, getID3V2Value(id3v2, "TIT2")); - setTagValue(props, PROP_ARTIST, getID3V2Value(id3v2, "TPE1")); - setTagValue(props, PROP_ALBUM_TITLE, getID3V2Value(id3v2, "TALB")); - setTagValue(props, PROP_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC")); - setTagValue(props, PROP_COMMENT, getID3V2Value(id3v2, "COMM")); - setTagValue(props, PROP_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK")); - setTagValue(props, PROP_GENRE, getID3V2Value(id3v2, "TCON")); - setTagValue(props, PROP_COMPOSER, getID3V2Value(id3v2, "TCOM")); - - // TODO sort out the lyrics - //System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT")); - //System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT")); - } - - AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag(); - if (lyrics3Tag != null) - { - System.out.println("Lyrics3 tag found."); - if (lyrics3Tag instanceof Lyrics3v2) - { - setTagValue(props, PROP_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2")); - setTagValue(props, PROP_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1")); - setTagValue(props, PROP_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB")); - setTagValue(props, PROP_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM")); - setTagValue(props, PROP_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT")); - setTagValue(props, PROP_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, 
"TCOM")); - } - } - - } - finally - { - tempFile.delete(); + // TODO sort out the size + //setTagValue(props, MusicModel.PROP_SIZE, id3v1.getSize()); } - // Set the destination values - if (props.get(PROP_SONG_TITLE) != null) + AbstractID3v2 id3v2 = mp3File.getID3v2Tag(); + if (id3v2 != null) { - destination.put(ContentModel.PROP_TITLE, props.get(PROP_SONG_TITLE)); + setTagValue(props, PROP_SONG_TITLE, getID3V2Value(id3v2, "TIT2")); + setTagValue(props, PROP_ARTIST, getID3V2Value(id3v2, "TPE1")); + setTagValue(props, PROP_ALBUM_TITLE, getID3V2Value(id3v2, "TALB")); + setTagValue(props, PROP_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC")); + setTagValue(props, PROP_COMMENT, getID3V2Value(id3v2, "COMM")); + setTagValue(props, PROP_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK")); + setTagValue(props, PROP_GENRE, getID3V2Value(id3v2, "TCON")); + setTagValue(props, PROP_COMPOSER, getID3V2Value(id3v2, "TCOM")); + + // TODO sort out the lyrics + //System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT")); + //System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT")); } - if (props.get(PROP_ARTIST) != null) + + AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag(); + if (lyrics3Tag != null) { - destination.put(ContentModel.PROP_AUTHOR, props.get(PROP_ARTIST)); + System.out.println("Lyrics3 tag found."); + if (lyrics3Tag instanceof Lyrics3v2) + { + setTagValue(props, PROP_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2")); + setTagValue(props, PROP_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1")); + setTagValue(props, PROP_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB")); + setTagValue(props, PROP_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM")); + setTagValue(props, PROP_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT")); + setTagValue(props, PROP_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM")); + } } - String description = getDescription(props); - if (description != null) - { - 
destination.put(ContentModel.PROP_DESCRIPTION, description); - } - } - catch (IOException ioException) - { - // TODO sort out exception handling - throw new RuntimeException("Error reading mp3 file.", ioException); + } - catch (TagException tagException) + finally { - // TODO sort out exception handling - throw new RuntimeException("Error reading mp3 tag information.", tagException); + tempFile.delete(); + } + + // Set the destination values + if (props.get(PROP_SONG_TITLE) != null) + { + destination.put(ContentModel.PROP_TITLE, props.get(PROP_SONG_TITLE)); + } + if (props.get(PROP_ARTIST) != null) + { + destination.put(ContentModel.PROP_AUTHOR, props.get(PROP_ARTIST)); + } + String description = getDescription(props); + if (description != null) + { + destination.put(ContentModel.PROP_DESCRIPTION, description); } } diff --git a/source/java/org/alfresco/repo/content/metadata/MetadataExtracterRegistry.java b/source/java/org/alfresco/repo/content/metadata/MetadataExtracterRegistry.java index 53940a390a..0a3fd4fe1a 100644 --- a/source/java/org/alfresco/repo/content/metadata/MetadataExtracterRegistry.java +++ b/source/java/org/alfresco/repo/content/metadata/MetadataExtracterRegistry.java @@ -16,7 +16,7 @@ */ package org.alfresco.repo.content.metadata; -import java.util.Collections; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -28,7 +28,6 @@ import org.alfresco.error.AlfrescoRuntimeException; import org.alfresco.repo.content.MimetypeMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.springframework.util.Assert; /** * Holds and provides the most appropriate metadate extracter for a particular @@ -52,15 +51,10 @@ public class MetadataExtracterRegistry /** controls write access to the cache */ private Lock extracterCacheWriteLock; - /** - * @param mimetypeMap all the mimetypes available to the system - */ - public MetadataExtracterRegistry(MimetypeMap mimetypeMap) + 
public MetadataExtracterRegistry() { - Assert.notNull(mimetypeMap, "The MimetypeMap is mandatory"); - this.mimetypeMap = mimetypeMap; - - extracters = Collections.emptyList(); // just in case it isn't set + // initialise lists + extracters = new ArrayList(10); extracterCache = new HashMap(17); // create lock objects for access to the cache @@ -69,6 +63,40 @@ public class MetadataExtracterRegistry extracterCacheWriteLock = extractionCacheLock.writeLock(); } + /** + * The mimetype map that will be used to check requests against + * + * @param mimetypeMap a map of mimetypes + */ + public void setMimetypeMap(MimetypeMap mimetypeMap) + { + this.mimetypeMap = mimetypeMap; + } + + /** + * Register an instance of an extracter for use + * + * @param extracter an extracter + */ + public void register(MetadataExtracter extracter) + { + if (logger.isDebugEnabled()) + { + logger.debug("Registering metadata extracter: " + extracter); + } + + extracterCacheWriteLock.lock(); + try + { + extracters.add(extracter); + extracterCache.clear(); + } + finally + { + extracterCacheWriteLock.unlock(); + } + } + /** * Gets the best metadata extracter. This is a combination of the most * reliable and the most performant extracter. @@ -123,8 +151,8 @@ public class MetadataExtracterRegistry /** * @param sourceMimetype The MIME type under examination - * @return The fastest of the most reliable extracters in - * extracters for the given MIME type. + * @return The fastest of the most reliable extracters in extracters + * for the given MIME type, or null if none is available. 
*/ private MetadataExtracter findBestExtracter(String sourceMimetype) { @@ -137,7 +165,12 @@ public class MetadataExtracterRegistry for (MetadataExtracter ext : extracters) { double r = ext.getReliability(sourceMimetype); - if (r == bestReliability) + if (r <= 0.0) + { + // extraction not achievable + continue; + } + else if (r == bestReliability) { long time = ext.getExtractionTime(); if (time < bestTime) @@ -155,26 +188,4 @@ public class MetadataExtracterRegistry } return bestExtracter; } - - /** - * Provides a list of self-discovering extracters. - * - * @param transformers all the available extracters that the registry can - * work with - */ - public void setExtracters(List extracters) - { - logger.debug("Setting " + extracters.size() + "new extracters."); - - extracterCacheWriteLock.lock(); - try - { - this.extracters = extracters; - this.extracterCache.clear(); - } - finally - { - extracterCacheWriteLock.unlock(); - } - } } \ No newline at end of file diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java index 1c5b234ceb..9f0917d50e 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java @@ -50,7 +50,7 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter super(new HashSet(Arrays.asList(mimeTypes)), 1.0, 1000); } - public void extract(ContentReader reader, final Map destination) throws ContentIOException + public void extractInternal(ContentReader reader, final Map destination) throws Throwable { POIFSReaderListener readerListener = new POIFSReaderListener() { @@ -96,12 +96,6 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter poiFSReader.registerListener(readerListener, SummaryInformation.DEFAULT_STREAM_NAME); poiFSReader.read(is); } - catch (IOException e) - { - throw new 
ContentIOException("Compound Document SummaryInformation metadata extraction failed: \n" - + " reader: " + reader, - e); - } finally { if (is != null) diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java index 0364ad9b55..302518f98e 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java @@ -16,6 +16,8 @@ */ package org.alfresco.repo.content.metadata; +import java.io.IOException; +import java.io.InputStream; import java.io.Serializable; import java.util.Arrays; import java.util.HashSet; @@ -23,11 +25,8 @@ import java.util.Map; import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import com.catcode.odf.ODFMetaFileAnalyzer; import com.catcode.odf.OpenDocumentMetadata; @@ -41,8 +40,6 @@ import com.catcode.odf.OpenDocumentMetadata; */ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter { - private static final Log logger = LogFactory.getLog(OpenDocumentMetadataExtracter.class); - private static String[] mimeTypes = new String[] { MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT, MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE, @@ -67,13 +64,15 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter super(new HashSet(Arrays.asList(mimeTypes)), 1.00, 1000); } - public void extract(ContentReader reader, Map destination) throws ContentIOException + public void extractInternal(ContentReader reader, Map destination) throws Throwable { ODFMetaFileAnalyzer analyzer = new ODFMetaFileAnalyzer(); + InputStream 
is = null; try { + is = reader.getContentInputStream(); // stream the document in - OpenDocumentMetadata docInfo = analyzer.analyzeZip(reader.getContentInputStream()); + OpenDocumentMetadata docInfo = analyzer.analyzeZip(is); if (docInfo != null) { @@ -84,12 +83,12 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter destination.put(ContentModel.PROP_CREATED, docInfo.getCreationDate()); } } - catch (Throwable e) + finally { - String message = "Metadata extraction failed: \n" + - " reader: " + reader; - logger.debug(message, e); - throw new ContentIOException(message, e); + if (is != null) + { + try { is.close(); } catch (IOException e) {} + } } } } \ No newline at end of file diff --git a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java index c34f0b81c3..e335c6cf83 100644 --- a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java @@ -17,17 +17,15 @@ package org.alfresco.repo.content.metadata; import java.io.IOException; +import java.io.InputStream; import java.io.Serializable; import java.util.Calendar; import java.util.Map; import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocumentInformation; @@ -37,26 +35,20 @@ import org.pdfbox.pdmodel.PDDocumentInformation; */ public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter { - - private static final Log logger = LogFactory.getLog(PdfBoxMetadataExtracter.class); - public PdfBoxMetadataExtracter() { 
super(MimetypeMap.MIMETYPE_PDF, 1.0, 1000); } - public void extract(ContentReader reader, Map destination) throws ContentIOException + public void extractInternal(ContentReader reader, Map destination) throws Throwable { - if (!MimetypeMap.MIMETYPE_PDF.equals(reader.getMimetype())) - { - logger.debug("No metadata extracted for " + reader.getMimetype()); - return; - } PDDocument pdf = null; + InputStream is = null; try { + is = reader.getContentInputStream(); // stream the document in - pdf = PDDocument.load(reader.getContentInputStream()); + pdf = PDDocument.load(is); // Scoop out the metadata PDDocumentInformation docInfo = pdf.getDocumentInformation(); @@ -68,23 +60,15 @@ public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter if (created != null) destination.put(ContentModel.PROP_CREATED, created.getTime()); } - catch (IOException e) - { - throw new ContentIOException("PDF metadata extraction failed: \n" + - " reader: " + reader); - } finally { + if (is != null) + { + try { is.close(); } catch (IOException e) {} + } if (pdf != null) { - try - { - pdf.close(); - } - catch (Throwable e) - { - e.printStackTrace(); - } + try { pdf.close(); } catch (Throwable e) { e.printStackTrace(); } } } } diff --git a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java index f218508d22..ddb3dd91cf 100644 --- a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracterTest.java @@ -1,16 +1,14 @@ package org.alfresco.repo.content.metadata; import org.alfresco.repo.content.MimetypeMap; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; /** - * @see org.alfresco.repo.content.transform.PdfBoxContentTransformer + * @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter + * * @author Jesper Steen Møller */ public class 
PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest { - private static final Log logger = LogFactory.getLog(PdfBoxMetadataExtracterTest.class); private MetadataExtracter extracter; public void onSetUpInTransaction() throws Exception diff --git a/source/java/org/alfresco/repo/content/metadata/StringMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/StringMetadataExtracter.java deleted file mode 100644 index 29cba14764..0000000000 --- a/source/java/org/alfresco/repo/content/metadata/StringMetadataExtracter.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2005 Jesper Steen Møller - * - * Licensed under the Mozilla Public License version 1.1 - * with a permitted attribution clause. You may obtain a - * copy of the License at - * - * http://www.alfresco.org/legal/license.txt - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, - * either express or implied. See the License for the specific - * language governing permissions and limitations under the - * License. 
- */ -package org.alfresco.repo.content.metadata; - -import java.io.Serializable; -import java.util.Map; - -import org.alfresco.service.cmr.repository.ContentIOException; -import org.alfresco.service.cmr.repository.ContentReader; -import org.alfresco.service.namespace.QName; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/** - * - * @author Jesper Steen Møller - */ -public class StringMetadataExtracter implements MetadataExtracter -{ - public static final String PREFIX_TEXT = "text/"; - - private static final Log logger = LogFactory.getLog(StringMetadataExtracter.class); - - public double getReliability(String sourceMimetype) - { - if (sourceMimetype.startsWith(PREFIX_TEXT)) - return 0.1; - else - return 0.0; - } - - public long getExtractionTime() - { - return 1000; - } - - public void extract(ContentReader reader, Map destination) throws ContentIOException - { - if (logger.isDebugEnabled()) - { - logger.debug("No metadata extracted for " + reader.getMimetype()); - } - } -} diff --git a/source/java/org/alfresco/repo/content/metadata/UnoMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/UnoMetadataExtracter.java index 4bdc69ddd0..5364f668b2 100644 --- a/source/java/org/alfresco/repo/content/metadata/UnoMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/UnoMetadataExtracter.java @@ -28,12 +28,9 @@ import net.sf.joott.uno.UnoConnection; import org.alfresco.model.ContentModel; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentIOException; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; import org.alfresco.util.TempFileProvider; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import com.sun.star.beans.PropertyValue; import com.sun.star.beans.XPropertySet; @@ -49,9 +46,6 @@ import com.sun.star.uno.UnoRuntime; */ public class 
UnoMetadataExtracter extends AbstractMetadataExtracter { - - private static final Log logger = LogFactory.getLog(UnoMetadataExtracter.class); - private static String[] mimeTypes = new String[] { MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT, MimetypeMap.MIMETYPE_OPENOFFICE1_WRITER, @@ -60,33 +54,44 @@ public class UnoMetadataExtracter extends AbstractMetadataExtracter // quality since they involve conversion. }; - public UnoMetadataExtracter(MimetypeMap mimetypeMap, String connectionUrl) - { - super(new HashSet(Arrays.asList(mimeTypes)), 1.00, 10000); - this.mimetypeMap = mimetypeMap; - init(connectionUrl); - } - - public UnoMetadataExtracter(MimetypeMap mimetypeMap) - { - this(mimetypeMap, UnoConnection.DEFAULT_CONNECTION_STRING); - } - private MimetypeMap mimetypeMap; + private String contentUrl; private MyUnoConnection connection; private boolean isConnected; - /** - * @param unoConnectionUrl the URL of the Uno server - */ - private synchronized void init(String unoConnectionUrl) + public UnoMetadataExtracter() { - connection = new MyUnoConnection(unoConnectionUrl); + super(new HashSet(Arrays.asList(mimeTypes)), 1.00, 10000); + this.contentUrl = UnoConnection.DEFAULT_CONNECTION_STRING; + } + + public void setMimetypeMap(MimetypeMap mimetypeMap) + { + this.mimetypeMap = mimetypeMap; + } + + /** + * + * @param contentUrl the URL to connect to + */ + public void setContentUrl(String contentUrl) + { + this.contentUrl = contentUrl; + } + + /** + * Initialises the bean by establishing an UNO connection + */ + public synchronized void init() + { + connection = new MyUnoConnection(contentUrl); // attempt to make an connection try { connection.connect(); isConnected = true; + // register + super.register(); } catch (ConnectException e) { @@ -103,66 +108,58 @@ public class UnoMetadataExtracter extends AbstractMetadataExtracter return isConnected; } - public void extract(ContentReader reader, final Map destination) throws ContentIOException + public void 
extractInternal(ContentReader reader, final Map destination) throws Throwable { String sourceMimetype = reader.getMimetype(); // create temporary files to convert from and to - File tempFromFile = TempFileProvider.createTempFile("UnoContentTransformer", "." + File tempFromFile = TempFileProvider.createTempFile( + "UnoContentTransformer_", "." + mimetypeMap.getExtension(sourceMimetype)); // download the content from the source reader reader.getContent(tempFromFile); - String sourceUrl = tempFromFile.toString(); - try - { - sourceUrl = toUrl(tempFromFile, connection); - // UNO Interprocess Bridge *should* be thread-safe, but... - synchronized (connection) + String sourceUrl = toUrl(tempFromFile, connection); + + // UNO Interprocess Bridge *should* be thread-safe, but... + synchronized (connection) + { + XComponentLoader desktop = connection.getDesktop(); + XComponent document = desktop.loadComponentFromURL( + sourceUrl, + "_blank", + 0, + new PropertyValue[] { property("Hidden", Boolean.TRUE) }); + if (document == null) { - XComponentLoader desktop = connection.getDesktop(); - XComponent document = desktop.loadComponentFromURL( - sourceUrl, - "_blank", - 0, - new PropertyValue[] { property("Hidden", Boolean.TRUE) }); - if (document == null) - { - throw new FileNotFoundException("could not open source document: " + sourceUrl); - } - try - { - XDocumentInfoSupplier infoSupplier = (XDocumentInfoSupplier) UnoRuntime.queryInterface( - XDocumentInfoSupplier.class, document); - XPropertySet propSet = (XPropertySet) UnoRuntime.queryInterface( - XPropertySet.class, - infoSupplier - .getDocumentInfo()); - - // Titled aspect - trimPut(ContentModel.PROP_TITLE, propSet.getPropertyValue("Title"), destination); - trimPut(ContentModel.PROP_DESCRIPTION, propSet.getPropertyValue("Subject"), destination); - - // Auditable aspect - // trimPut(ContentModel.PROP_CREATED, - // si.getCreateDateTime(), destination); - trimPut(ContentModel.PROP_AUTHOR, propSet.getPropertyValue("Author"), 
destination); - // trimPut(ContentModel.PROP_MODIFIED, - // si.getLastSaveDateTime(), destination); - // trimPut(ContentModel.PROP_MODIFIER, si.getLastAuthor(), - // destination); - } - finally - { - document.dispose(); - } + throw new FileNotFoundException("could not open source document: " + sourceUrl); + } + try + { + XDocumentInfoSupplier infoSupplier = (XDocumentInfoSupplier) UnoRuntime.queryInterface( + XDocumentInfoSupplier.class, document); + XPropertySet propSet = (XPropertySet) UnoRuntime.queryInterface( + XPropertySet.class, + infoSupplier + .getDocumentInfo()); + + // Titled aspect + trimPut(ContentModel.PROP_TITLE, propSet.getPropertyValue("Title"), destination); + trimPut(ContentModel.PROP_DESCRIPTION, propSet.getPropertyValue("Subject"), destination); + + // Auditable aspect + // trimPut(ContentModel.PROP_CREATED, + // si.getCreateDateTime(), destination); + trimPut(ContentModel.PROP_AUTHOR, propSet.getPropertyValue("Author"), destination); + // trimPut(ContentModel.PROP_MODIFIED, + // si.getLastSaveDateTime(), destination); + // trimPut(ContentModel.PROP_MODIFIER, si.getLastAuthor(), + // destination); + } + finally + { + document.dispose(); } - } - catch (Throwable e) - { - throw new ContentIOException("Conversion failed: \n" + - " source: " + sourceUrl + "\n", - e); } } diff --git a/source/java/org/alfresco/repo/content/metadata/UnoMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/UnoMetadataExtracterTest.java index 3676728cc9..474d9f7700 100644 --- a/source/java/org/alfresco/repo/content/metadata/UnoMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/UnoMetadataExtracterTest.java @@ -19,7 +19,6 @@ package org.alfresco.repo.content.metadata; import org.alfresco.repo.content.MimetypeMap; /** - * @see org.alfresco.repo.content.transform.UnoMetadataExtracter * @author Jesper Steen Møller */ public class UnoMetadataExtracterTest extends AbstractMetadataExtracterTest @@ -28,7 +27,8 @@ public class 
UnoMetadataExtracterTest extends AbstractMetadataExtracterTest public void onSetUpInTransaction() throws Exception { - extracter = new UnoMetadataExtracter(mimetypeMap); + extracter = new UnoMetadataExtracter(); + extracter.setMimetypeMap(mimetypeMap); } /**