mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-24 17:32:48 +00:00
Inverted configuration of Metadata Extracters
- Adding an extracter no longer requires modification to the MetadataExtracterRegistry. Fixed lack of stream closures. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@2465 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -87,28 +87,36 @@
|
||||
</constructor-arg>
|
||||
</bean>
|
||||
|
||||
<!-- Metadata Extraction Registry -->
|
||||
<bean id="metadataExtracterRegistry" class="org.alfresco.repo.content.metadata.MetadataExtracterRegistry" >
|
||||
<constructor-arg>
|
||||
<property name="mimetypeMap">
|
||||
<ref bean="mimetypeService" />
|
||||
</constructor-arg>
|
||||
<!-- metadata extracters -->
|
||||
<property name="extracters">
|
||||
<list>
|
||||
<bean class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.StringMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.UnoMetadataExtracter" >
|
||||
<constructor-arg>
|
||||
<ref bean="mimetypeService" />
|
||||
</constructor-arg>
|
||||
</bean>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- Abstract bean definition defining base definition for all metadata extracters -->
|
||||
<bean id="baseMetadataExtracter"
|
||||
class="org.alfresco.repo.content.metadata.AbstractMetadataExtracter"
|
||||
abstract="true"
|
||||
init-method="register">
|
||||
<property name="registry">
|
||||
<ref bean="metadataExtracterRegistry" />
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
<!-- Content Metadata Extracters -->
|
||||
<bean class="org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.OfficeMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.HtmlMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.MP3MetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.OpenDocumentMetadataExtracter" parent="baseMetadataExtracter" />
|
||||
<bean class="org.alfresco.repo.content.metadata.UnoMetadataExtracter" parent="baseMetadataExtracter" init-method="init" >
|
||||
<property name="mimetypeMap">
|
||||
<ref bean="mimetypeService" />
|
||||
</property>
|
||||
</bean>
|
||||
|
||||
|
||||
<!-- Content Transformation Registry -->
|
||||
<bean id="contentTransformerRegistry" class="org.alfresco.repo.content.transform.ContentTransformerRegistry" >
|
||||
<constructor-arg>
|
||||
@@ -116,9 +124,7 @@
|
||||
</constructor-arg>
|
||||
</bean>
|
||||
|
||||
<!--
|
||||
Abstract bean definition defining base definition for all transformers
|
||||
-->
|
||||
<!-- Abstract bean definition defining base definition for all transformers -->
|
||||
<bean id="baseContentTransformer"
|
||||
class="org.alfresco.repo.content.transform.AbstractContentTransformer"
|
||||
abstract="true"
|
||||
|
@@ -21,7 +21,12 @@ import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
@@ -29,28 +34,60 @@ import org.alfresco.service.namespace.QName;
|
||||
*/
|
||||
abstract public class AbstractMetadataExtracter implements MetadataExtracter
|
||||
{
|
||||
private static Log logger = LogFactory.getLog(AbstractMetadataExtracter.class);
|
||||
|
||||
private Set<String> mimetypes;
|
||||
private MetadataExtracterRegistry registry;
|
||||
private Set<String> supportedMimetypes;
|
||||
private double reliability;
|
||||
private long extractionTime;
|
||||
|
||||
protected AbstractMetadataExtracter(String mimetype, double reliability, long extractionTime)
|
||||
protected AbstractMetadataExtracter(String supportedMimetype, double reliability, long extractionTime)
|
||||
{
|
||||
this.mimetypes = Collections.singleton(mimetype);
|
||||
this.supportedMimetypes = Collections.singleton(supportedMimetype);
|
||||
this.reliability = reliability;
|
||||
this.extractionTime = extractionTime;
|
||||
}
|
||||
|
||||
protected AbstractMetadataExtracter(Set<String> mimetypes, double reliability, long extractionTime)
|
||||
protected AbstractMetadataExtracter(Set<String> supportedMimetypes, double reliability, long extractionTime)
|
||||
{
|
||||
this.mimetypes = mimetypes;
|
||||
this.supportedMimetypes = supportedMimetypes;
|
||||
this.reliability = reliability;
|
||||
this.extractionTime = extractionTime;
|
||||
}
|
||||
|
||||
public double getReliability(String sourceMimetype)
|
||||
/**
|
||||
* Set the registry to register with
|
||||
*
|
||||
* @param registry a metadata extracter registry
|
||||
*/
|
||||
public void setRegistry(MetadataExtracterRegistry registry)
|
||||
{
|
||||
if (mimetypes.contains(sourceMimetype))
|
||||
this.registry = registry;
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers this instance of the extracter with the registry.
|
||||
*
|
||||
* @see #setRegistry(MetadataExtracterRegistry)
|
||||
*/
|
||||
public void register()
|
||||
{
|
||||
if (registry == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Property 'registry' has not been set");
|
||||
}
|
||||
registry.register(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Default reliability check that returns the reliability as configured by the constructor
|
||||
* if the mimetype is in the list of supported mimetypes.
|
||||
*
|
||||
* @param mimetype the mimetype to check
|
||||
*/
|
||||
public double getReliability(String mimetype)
|
||||
{
|
||||
if (supportedMimetypes.contains(mimetype))
|
||||
return reliability;
|
||||
else
|
||||
return 0.0;
|
||||
@@ -61,6 +98,68 @@ abstract public class AbstractMetadataExtracter implements MetadataExtracter
|
||||
return extractionTime;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the mimetype is supported.
|
||||
*
|
||||
* @param reader the reader to check
|
||||
* @throws AlfrescoRuntimeException if the mimetype is not supported
|
||||
*/
|
||||
protected void checkReliability(ContentReader reader)
|
||||
{
|
||||
String mimetype = reader.getMimetype();
|
||||
if (getReliability(mimetype) <= 0.0)
|
||||
{
|
||||
throw new AlfrescoRuntimeException(
|
||||
"Metadata extracter does not support mimetype: \n" +
|
||||
" reader: " + reader + "\n" +
|
||||
" supported: " + supportedMimetypes + "\n" +
|
||||
" extracter: " + this);
|
||||
}
|
||||
}
|
||||
|
||||
public final void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
// check the reliability
|
||||
checkReliability(reader);
|
||||
|
||||
try
|
||||
{
|
||||
extractInternal(reader, destination);
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new ContentIOException("Metadata extraction failed: \n" +
|
||||
" reader: " + reader + "\n" +
|
||||
e);
|
||||
}
|
||||
finally
|
||||
{
|
||||
// check that the reader has been closed by the extracter
|
||||
if (!reader.isClosed())
|
||||
{
|
||||
logger.error("Content reader not closed by metadata extracter: \n" + reader);
|
||||
}
|
||||
}
|
||||
|
||||
// done
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Completed metadata extraction: \n" +
|
||||
" reader: " + reader + "\n" +
|
||||
" extracter: " + this);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Override to provide the necessary extraction logic. Implementations must ensure that the reader
|
||||
* is closed before the method exits.
|
||||
*
|
||||
* @param reader the source of the content
|
||||
* @param destination the property map to fill
|
||||
* @throws Throwable an exception
|
||||
*/
|
||||
protected abstract void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable;
|
||||
|
||||
/**
|
||||
* Examines a value or string for nulls and adds it to the map (if
|
||||
* non-empty)
|
||||
|
@@ -16,7 +16,6 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
@@ -34,7 +33,6 @@ import javax.swing.text.html.parser.ParserDelegator;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
@@ -56,11 +54,10 @@ public class HtmlMetadataExtracter extends AbstractMetadataExtracter
|
||||
super(MIMETYPES, 1.0, 1000);
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
|
||||
{
|
||||
final Map<QName, Serializable> tempDestination = new HashMap<QName, Serializable>();
|
||||
try
|
||||
{
|
||||
|
||||
HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback()
|
||||
{
|
||||
StringBuffer title = null;
|
||||
@@ -169,9 +166,4 @@ public class HtmlMetadataExtracter extends AbstractMetadataExtracter
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
throw new ContentIOException("HTML metadata extraction failed: \n" + " reader: " + reader, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -17,16 +17,12 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.OfficeMetadataExtracter
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(HtmlMetadataExtracterTest.class);
|
||||
private MetadataExtracter extracter;
|
||||
|
||||
public void onSetUpInTransaction() throws Exception
|
||||
|
@@ -17,20 +17,17 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.alfresco.util.GUID;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
import org.farng.mp3.AbstractMP3FragmentBody;
|
||||
import org.farng.mp3.MP3File;
|
||||
import org.farng.mp3.TagException;
|
||||
import org.farng.mp3.id3.AbstractID3v2;
|
||||
import org.farng.mp3.id3.AbstractID3v2Frame;
|
||||
import org.farng.mp3.id3.ID3v1;
|
||||
@@ -58,18 +55,14 @@ public class MP3MetadataExtracter extends AbstractMetadataExtracter
|
||||
super(MimetypeMap.MIMETYPE_MP3, 1.0, 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.metadata.MetadataExtracter#extract(org.alfresco.service.cmr.repository.ContentReader, java.util.Map)
|
||||
*/
|
||||
public void extract(ContentReader reader,
|
||||
Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
try
|
||||
public void extractInternal(
|
||||
ContentReader reader,
|
||||
Map<QName, Serializable> destination) throws Throwable
|
||||
{
|
||||
Map<QName, Serializable> props = new HashMap<QName, Serializable>();
|
||||
|
||||
// Create a temp file
|
||||
File tempFile = File.createTempFile(GUID.generate(), ".tmp");
|
||||
File tempFile = TempFileProvider.createTempFile("MP3MetadataExtracter_", ".tmp");
|
||||
try
|
||||
{
|
||||
reader.getContent(tempFile);
|
||||
@@ -146,17 +139,6 @@ public class MP3MetadataExtracter extends AbstractMetadataExtracter
|
||||
destination.put(ContentModel.PROP_DESCRIPTION, description);
|
||||
}
|
||||
}
|
||||
catch (IOException ioException)
|
||||
{
|
||||
// TODO sort out exception handling
|
||||
throw new RuntimeException("Error reading mp3 file.", ioException);
|
||||
}
|
||||
catch (TagException tagException)
|
||||
{
|
||||
// TODO sort out exception handling
|
||||
throw new RuntimeException("Error reading mp3 tag information.", tagException);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
@@ -16,7 +16,7 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@@ -28,7 +28,6 @@ import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.springframework.util.Assert;
|
||||
|
||||
/**
|
||||
* Holds and provides the most appropriate metadata extracter for a particular
|
||||
@@ -52,15 +51,10 @@ public class MetadataExtracterRegistry
|
||||
/** controls write access to the cache */
|
||||
private Lock extracterCacheWriteLock;
|
||||
|
||||
/**
|
||||
* @param mimetypeMap all the mimetypes available to the system
|
||||
*/
|
||||
public MetadataExtracterRegistry(MimetypeMap mimetypeMap)
|
||||
public MetadataExtracterRegistry()
|
||||
{
|
||||
Assert.notNull(mimetypeMap, "The MimetypeMap is mandatory");
|
||||
this.mimetypeMap = mimetypeMap;
|
||||
|
||||
extracters = Collections.emptyList(); // just in case it isn't set
|
||||
// initialise lists
|
||||
extracters = new ArrayList<MetadataExtracter>(10);
|
||||
extracterCache = new HashMap<String, MetadataExtracter>(17);
|
||||
|
||||
// create lock objects for access to the cache
|
||||
@@ -69,6 +63,40 @@ public class MetadataExtracterRegistry
|
||||
extracterCacheWriteLock = extractionCacheLock.writeLock();
|
||||
}
|
||||
|
||||
/**
|
||||
* The mimetype map that will be used to check requests against
|
||||
*
|
||||
* @param mimetypeMap a map of mimetypes
|
||||
*/
|
||||
public void setMimetypeMap(MimetypeMap mimetypeMap)
|
||||
{
|
||||
this.mimetypeMap = mimetypeMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Register an instance of an extracter for use
|
||||
*
|
||||
* @param extracter an extracter
|
||||
*/
|
||||
public void register(MetadataExtracter extracter)
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("Registering metadata extracter: " + extracter);
|
||||
}
|
||||
|
||||
extracterCacheWriteLock.lock();
|
||||
try
|
||||
{
|
||||
extracters.add(extracter);
|
||||
extracterCache.clear();
|
||||
}
|
||||
finally
|
||||
{
|
||||
extracterCacheWriteLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the best metadata extracter. This is a combination of the most
|
||||
* reliable and the most performant extracter.
|
||||
@@ -123,8 +151,8 @@ public class MetadataExtracterRegistry
|
||||
|
||||
/**
|
||||
* @param sourceMimetype The MIME type under examination
|
||||
* @return The fastest of the most reliable extracters in
|
||||
* <code>extracters</code> for the given MIME type.
|
||||
* @return The fastest of the most reliable extracters in <code>extracters</code>
|
||||
* for the given MIME type, or null if none is available.
|
||||
*/
|
||||
private MetadataExtracter findBestExtracter(String sourceMimetype)
|
||||
{
|
||||
@@ -137,7 +165,12 @@ public class MetadataExtracterRegistry
|
||||
for (MetadataExtracter ext : extracters)
|
||||
{
|
||||
double r = ext.getReliability(sourceMimetype);
|
||||
if (r == bestReliability)
|
||||
if (r <= 0.0)
|
||||
{
|
||||
// extraction not achievable
|
||||
continue;
|
||||
}
|
||||
else if (r == bestReliability)
|
||||
{
|
||||
long time = ext.getExtractionTime();
|
||||
if (time < bestTime)
|
||||
@@ -155,26 +188,4 @@ public class MetadataExtracterRegistry
|
||||
}
|
||||
return bestExtracter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides a list of self-discovering extracters.
|
||||
*
|
||||
* @param transformers all the available extracters that the registry can
|
||||
* work with
|
||||
*/
|
||||
public void setExtracters(List<MetadataExtracter> extracters)
|
||||
{
|
||||
logger.debug("Setting " + extracters.size() + "new extracters.");
|
||||
|
||||
extracterCacheWriteLock.lock();
|
||||
try
|
||||
{
|
||||
this.extracters = extracters;
|
||||
this.extracterCache.clear();
|
||||
}
|
||||
finally
|
||||
{
|
||||
extracterCacheWriteLock.unlock();
|
||||
}
|
||||
}
|
||||
}
|
@@ -50,7 +50,7 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.0, 1000);
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, final Map<QName, Serializable> destination) throws ContentIOException
|
||||
public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
|
||||
{
|
||||
POIFSReaderListener readerListener = new POIFSReaderListener()
|
||||
{
|
||||
@@ -96,12 +96,6 @@ public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
poiFSReader.registerListener(readerListener, SummaryInformation.DEFAULT_STREAM_NAME);
|
||||
poiFSReader.read(is);
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
throw new ContentIOException("Compound Document SummaryInformation metadata extraction failed: \n"
|
||||
+ " reader: " + reader,
|
||||
e);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
|
@@ -16,6 +16,8 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
@@ -23,11 +25,8 @@ import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.catcode.odf.ODFMetaFileAnalyzer;
|
||||
import com.catcode.odf.OpenDocumentMetadata;
|
||||
@@ -41,8 +40,6 @@ import com.catcode.odf.OpenDocumentMetadata;
|
||||
*/
|
||||
public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(OpenDocumentMetadataExtracter.class);
|
||||
|
||||
private static String[] mimeTypes = new String[] {
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE,
|
||||
@@ -67,13 +64,15 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
|
||||
super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 1000);
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
|
||||
{
|
||||
ODFMetaFileAnalyzer analyzer = new ODFMetaFileAnalyzer();
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
// stream the document in
|
||||
OpenDocumentMetadata docInfo = analyzer.analyzeZip(reader.getContentInputStream());
|
||||
OpenDocumentMetadata docInfo = analyzer.analyzeZip(is);
|
||||
|
||||
if (docInfo != null)
|
||||
{
|
||||
@@ -84,12 +83,12 @@ public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
|
||||
destination.put(ContentModel.PROP_CREATED, docInfo.getCreationDate());
|
||||
}
|
||||
}
|
||||
catch (Throwable e)
|
||||
finally
|
||||
{
|
||||
String message = "Metadata extraction failed: \n" +
|
||||
" reader: " + reader;
|
||||
logger.debug(message, e);
|
||||
throw new ContentIOException(message, e);
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -17,17 +17,15 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Calendar;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.pdfbox.pdmodel.PDDocument;
|
||||
import org.pdfbox.pdmodel.PDDocumentInformation;
|
||||
|
||||
@@ -37,26 +35,20 @@ import org.pdfbox.pdmodel.PDDocumentInformation;
|
||||
*/
|
||||
public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
||||
private static final Log logger = LogFactory.getLog(PdfBoxMetadataExtracter.class);
|
||||
|
||||
public PdfBoxMetadataExtracter()
|
||||
{
|
||||
super(MimetypeMap.MIMETYPE_PDF, 1.0, 1000);
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
|
||||
{
|
||||
if (!MimetypeMap.MIMETYPE_PDF.equals(reader.getMimetype()))
|
||||
{
|
||||
logger.debug("No metadata extracted for " + reader.getMimetype());
|
||||
return;
|
||||
}
|
||||
PDDocument pdf = null;
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
// stream the document in
|
||||
pdf = PDDocument.load(reader.getContentInputStream());
|
||||
pdf = PDDocument.load(is);
|
||||
// Scoop out the metadata
|
||||
PDDocumentInformation docInfo = pdf.getDocumentInformation();
|
||||
|
||||
@@ -68,23 +60,15 @@ public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter
|
||||
if (created != null)
|
||||
destination.put(ContentModel.PROP_CREATED, created.getTime());
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
throw new ContentIOException("PDF metadata extraction failed: \n" +
|
||||
" reader: " + reader);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
if (pdf != null)
|
||||
{
|
||||
try
|
||||
{
|
||||
pdf.close();
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
try { pdf.close(); } catch (Throwable e) { e.printStackTrace(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,16 +1,14 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.PdfBoxContentTransformer
|
||||
* @see org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(PdfBoxMetadataExtracterTest.class);
|
||||
private MetadataExtracter extracter;
|
||||
|
||||
public void onSetUpInTransaction() throws Exception
|
||||
|
@@ -1,58 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class StringMetadataExtracter implements MetadataExtracter
|
||||
{
|
||||
public static final String PREFIX_TEXT = "text/";
|
||||
|
||||
private static final Log logger = LogFactory.getLog(StringMetadataExtracter.class);
|
||||
|
||||
public double getReliability(String sourceMimetype)
|
||||
{
|
||||
if (sourceMimetype.startsWith(PREFIX_TEXT))
|
||||
return 0.1;
|
||||
else
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
public long getExtractionTime()
|
||||
{
|
||||
return 1000;
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("No metadata extracted for " + reader.getMimetype());
|
||||
}
|
||||
}
|
||||
}
|
@@ -28,12 +28,9 @@ import net.sf.joott.uno.UnoConnection;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.sun.star.beans.PropertyValue;
|
||||
import com.sun.star.beans.XPropertySet;
|
||||
@@ -49,9 +46,6 @@ import com.sun.star.uno.UnoRuntime;
|
||||
*/
|
||||
public class UnoMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
||||
private static final Log logger = LogFactory.getLog(UnoMetadataExtracter.class);
|
||||
|
||||
private static String[] mimeTypes = new String[] {
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT,
|
||||
MimetypeMap.MIMETYPE_OPENOFFICE1_WRITER,
|
||||
@@ -60,33 +54,44 @@ public class UnoMetadataExtracter extends AbstractMetadataExtracter
|
||||
// quality since they involve conversion.
|
||||
};
|
||||
|
||||
public UnoMetadataExtracter(MimetypeMap mimetypeMap, String connectionUrl)
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 10000);
|
||||
this.mimetypeMap = mimetypeMap;
|
||||
init(connectionUrl);
|
||||
}
|
||||
|
||||
public UnoMetadataExtracter(MimetypeMap mimetypeMap)
|
||||
{
|
||||
this(mimetypeMap, UnoConnection.DEFAULT_CONNECTION_STRING);
|
||||
}
|
||||
|
||||
private MimetypeMap mimetypeMap;
|
||||
private String contentUrl;
|
||||
private MyUnoConnection connection;
|
||||
private boolean isConnected;
|
||||
|
||||
/**
|
||||
* @param unoConnectionUrl the URL of the Uno server
|
||||
*/
|
||||
private synchronized void init(String unoConnectionUrl)
|
||||
public UnoMetadataExtracter()
|
||||
{
|
||||
connection = new MyUnoConnection(unoConnectionUrl);
|
||||
super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 10000);
|
||||
this.contentUrl = UnoConnection.DEFAULT_CONNECTION_STRING;
|
||||
}
|
||||
|
||||
public void setMimetypeMap(MimetypeMap mimetypeMap)
|
||||
{
|
||||
this.mimetypeMap = mimetypeMap;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param contentUrl the URL to connect to
|
||||
*/
|
||||
public void setContentUrl(String contentUrl)
|
||||
{
|
||||
this.contentUrl = contentUrl;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialises the bean by establishing an UNO connection
|
||||
*/
|
||||
public synchronized void init()
|
||||
{
|
||||
connection = new MyUnoConnection(contentUrl);
|
||||
// attempt to make a connection
|
||||
try
|
||||
{
|
||||
connection.connect();
|
||||
isConnected = true;
|
||||
// register
|
||||
super.register();
|
||||
}
|
||||
catch (ConnectException e)
|
||||
{
|
||||
@@ -103,19 +108,18 @@ public class UnoMetadataExtracter extends AbstractMetadataExtracter
|
||||
return isConnected;
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, final Map<QName, Serializable> destination) throws ContentIOException
|
||||
public void extractInternal(ContentReader reader, final Map<QName, Serializable> destination) throws Throwable
|
||||
{
|
||||
String sourceMimetype = reader.getMimetype();
|
||||
|
||||
// create temporary files to convert from and to
|
||||
File tempFromFile = TempFileProvider.createTempFile("UnoContentTransformer", "."
|
||||
File tempFromFile = TempFileProvider.createTempFile(
|
||||
"UnoContentTransformer_", "."
|
||||
+ mimetypeMap.getExtension(sourceMimetype));
|
||||
// download the content from the source reader
|
||||
reader.getContent(tempFromFile);
|
||||
String sourceUrl = tempFromFile.toString();
|
||||
try
|
||||
{
|
||||
sourceUrl = toUrl(tempFromFile, connection);
|
||||
|
||||
String sourceUrl = toUrl(tempFromFile, connection);
|
||||
|
||||
// UNO Interprocess Bridge *should* be thread-safe, but...
|
||||
synchronized (connection)
|
||||
@@ -158,13 +162,6 @@ public class UnoMetadataExtracter extends AbstractMetadataExtracter
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new ContentIOException("Conversion failed: \n" +
|
||||
" source: " + sourceUrl + "\n",
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
public String toUrl(File file, MyUnoConnection connection) throws ConnectException
|
||||
{
|
||||
|
@@ -19,7 +19,6 @@ package org.alfresco.repo.content.metadata;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.UnoMetadataExtracter
|
||||
* @author Jesper Steen Møller
|
||||
*/
|
||||
public class UnoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
@@ -28,7 +27,8 @@ public class UnoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
|
||||
public void onSetUpInTransaction() throws Exception
|
||||
{
|
||||
extracter = new UnoMetadataExtracter(mimetypeMap);
|
||||
extracter = new UnoMetadataExtracter();
|
||||
extracter.setMimetypeMap(mimetypeMap);
|
||||
}
|
||||
|
||||
/**
|
||||
|
Reference in New Issue
Block a user