mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-08-14 17:58:27 +00:00
Save point: [skip ci]
* Moved iptc reference info to model rather than having a separate models directory * Use InputStream and OutputStream in metadata extractors rather than Files.
This commit is contained in:
@@ -38,7 +38,8 @@ public interface TransformManager
|
|||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* Allows a {@link CustomTransformer} to use a local source {@code File} rather than the supplied {@code InputStream}.
|
* Allows a {@link CustomTransformer} to use a local source {@code File} rather than the supplied {@code InputStream}.
|
||||||
* The file will be deleted once the request is completed.
|
* The file will be deleted once the request is completed. To avoid creating extra files, if a File has already
|
||||||
|
* been created by the base t-engine, it is returned.
|
||||||
* If possible this method should be avoided as it is better not to leave content on disk.
|
* If possible this method should be avoided as it is better not to leave content on disk.
|
||||||
* @throws IllegalStateException if this method has already been called.
|
* @throws IllegalStateException if this method has already been called.
|
||||||
*/
|
*/
|
||||||
@@ -46,14 +47,14 @@ public interface TransformManager
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Allows a {@link CustomTransformer} to use a local target {@code File} rather than the supplied {@code OutputStream}.
|
* Allows a {@link CustomTransformer} to use a local target {@code File} rather than the supplied {@code OutputStream}.
|
||||||
* The file will be deleted once the request is completed.
|
* The file will be deleted once the request is completed. To avoid creating extra files, if a File has already
|
||||||
|
* been created by the base t-engine, it is returned.
|
||||||
* If possible this method should be avoided as it is better not to leave content on disk.
|
* If possible this method should be avoided as it is better not to leave content on disk.
|
||||||
* @throws IllegalStateException if this method has already been called. A call to {@link #respondWithFragment(Integer)}
|
* @throws IllegalStateException if this method has already been called. A call to {@link #respondWithFragment(Integer)}
|
||||||
* allows the method to be called again.
|
* allows the method to be called again.
|
||||||
*/
|
*/
|
||||||
File createTargetFile();
|
File createTargetFile();
|
||||||
|
|
||||||
// TODO: Do we want to support the following?
|
|
||||||
/**
|
/**
|
||||||
* Allows a single transform request to have multiple transform responses. For example images from a video at
|
* Allows a single transform request to have multiple transform responses. For example images from a video at
|
||||||
* different time offsets or different pages of a document. Following a call to this method a transform response is
|
* different time offsets or different pages of a document. Following a call to this method a transform response is
|
||||||
|
@@ -33,7 +33,6 @@ import org.alfresco.transform.base.CustomTransformer;
|
|||||||
import org.alfresco.transform.base.TransformManager;
|
import org.alfresco.transform.base.TransformManager;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
@@ -64,19 +63,17 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
|
|||||||
*
|
*
|
||||||
* The transform results in a Map of extracted properties encoded as json being returned to the content repository.
|
* The transform results in a Map of extracted properties encoded as json being returned to the content repository.
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>The content repository will use a transform in preference to any metadata extractors it might have defined
|
* <li>The method extracts ALL available metadata from the document with
|
||||||
* locally for the same MIMETYPE.</li>
|
* {@link #extractMetadata(String, InputStream, String, OutputStream, Map, TransformManager)} and then calls
|
||||||
* <li>The T-Engine's Controller class will call a method in a class that extends {@link AbstractMetadataExtractor}
|
* {@link #mapMetadataAndWrite(OutputStream, Map, Map)}.</li>
|
||||||
* based on the source and target mediatypes in the normal way.</li>
|
|
||||||
* <li>The method extracts ALL available metadata is extracted from the document and then calls
|
|
||||||
* {@link #mapMetadataAndWrite(File, Map, Map)}.</li>
|
|
||||||
* <li>Selected values from the available metadata are mapped into content repository property names and values,
|
* <li>Selected values from the available metadata are mapped into content repository property names and values,
|
||||||
* depending on what is defined in a {@code "<classname>_metadata_extract.properties"} file.</li>
|
* depending on what is defined in a {@code "<classname>_metadata_extract.properties"} file.</li>
|
||||||
* <li>The selected values are set back to the content repository as a JSON representation of a Map, where the values
|
* <li>The selected values are set back to the content repository as a JSON representation of a Map, where the values
|
||||||
* are applied to the source node.</li>
|
* are applied to the source node.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
* To support the same functionality as metadata extractors configured inside the content repository,
|
* To support the same functionality as metadata extractors configured inside the content repository,
|
||||||
* extra key value pairs may be returned from {@link #extractMetadata}. These are:
|
* extra key value pairs may be returned from {@link #extractMetadata(String, InputStream, String, OutputStream, Map, TransformManager)}.
|
||||||
|
* These are:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>{@code "sys:overwritePolicy"} which can specify the
|
* <li>{@code "sys:overwritePolicy"} which can specify the
|
||||||
* {@code org.alfresco.repo.content.metadata.MetadataExtracter.OverwritePolicy} name. Defaults to "PRAGMATIC".</li>
|
* {@code org.alfresco.repo.content.metadata.MetadataExtracter.OverwritePolicy} name. Defaults to "PRAGMATIC".</li>
|
||||||
@@ -89,7 +86,8 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
|
|||||||
* If a transform specifies that it can convert from {@code "<MIMETYPE>"} to {@code "alfresco-metadata-embed"}, it is
|
* If a transform specifies that it can convert from {@code "<MIMETYPE>"} to {@code "alfresco-metadata-embed"}, it is
|
||||||
* indicating that it can embed metadata in {@code <MIMETYPE>}.
|
* indicating that it can embed metadata in {@code <MIMETYPE>}.
|
||||||
*
|
*
|
||||||
* The transform results in a new version of supplied source file that contains the metadata supplied in the transform
|
* The transform calls {@link #embedMetadata(String, InputStream, String, OutputStream, Map, TransformManager)}
|
||||||
|
* which should result in a new version of the supplied source file that contains the metadata supplied in the transform
|
||||||
* options.
|
* options.
|
||||||
*
|
*
|
||||||
* @author Jesper Steen Møller
|
* @author Jesper Steen Møller
|
||||||
@@ -162,24 +160,13 @@ public abstract class AbstractMetadataExtractor implements CustomTransformer
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
extractMetadata(sourceMimetype, inputStream, targetMimetype, outputStream, transformOptions, transformManager);
|
extractMapAndWriteMetadata(sourceMimetype, inputStream, targetMimetype, outputStream, transformOptions, transformManager);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void embedMetadata(String sourceMimetype, InputStream inputStream,
|
public abstract void embedMetadata(String sourceMimetype, InputStream inputStream, String targetMimetype,
|
||||||
String targetMimetype, OutputStream outputStream,
|
OutputStream outputStream, Map<String, String> transformOptions, TransformManager transformManager)
|
||||||
Map<String, String> transformOptions, TransformManager transformManager) throws Exception
|
throws Exception;
|
||||||
{
|
|
||||||
File sourceFile = transformManager.createSourceFile();
|
|
||||||
File targetFile = transformManager.createTargetFile();
|
|
||||||
embedMetadata(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void embedMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
|
|
||||||
File sourceFile, File targetFile) throws Exception
|
|
||||||
{
|
|
||||||
// Default nothing, as embedding is not supported in most cases
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Map<String, Serializable> getMetadata(Map<String, String> transformOptions)
|
protected Map<String, Serializable> getMetadata(Map<String, String> transformOptions)
|
||||||
{
|
{
|
||||||
@@ -507,31 +494,18 @@ public abstract class AbstractMetadataExtractor implements CustomTransformer
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void extractMetadata(String sourceMimetype, InputStream inputStream,
|
private void extractMapAndWriteMetadata(String sourceMimetype, InputStream inputStream, String targetMimetype,
|
||||||
String targetMimetype, OutputStream outputStream,
|
OutputStream outputStream, Map<String, String> transformOptions, TransformManager transformManager)
|
||||||
Map<String, String> transformOptions, TransformManager transformManager) throws Exception
|
throws Exception
|
||||||
{
|
{
|
||||||
File sourceFile = transformManager.createSourceFile();
|
|
||||||
File targetFile = transformManager.createTargetFile();
|
|
||||||
extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The {@code transformOptions} may contain a replacement set of mappings. These will be used in place of the
|
|
||||||
* default mappings from read from file if supplied.
|
|
||||||
*/
|
|
||||||
public void extractMetadata(String sourceMimetype, Map<String, String> transformOptions, File sourceFile,
|
|
||||||
File targetFile) throws Exception
|
|
||||||
{
|
|
||||||
Map<String, Set<String>> mapping = getExtractMappingFromOptions(transformOptions, defaultExtractMapping);
|
|
||||||
|
|
||||||
// Use a ThreadLocal to avoid changing method signatures of methods that currently call getExtractMapping.
|
// Use a ThreadLocal to avoid changing method signatures of methods that currently call getExtractMapping.
|
||||||
|
Map<String, Set<String>> mapping = getExtractMappingFromOptions(transformOptions, defaultExtractMapping);
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
extractMapping.set(mapping);
|
extractMapping.set(mapping);
|
||||||
Map<String, Serializable> metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile);
|
Map<String, Serializable> metadata = extractMetadata(sourceMimetype, inputStream, targetMimetype,
|
||||||
mapMetadataAndWrite(targetFile, metadata, mapping);
|
outputStream, transformOptions, transformManager);
|
||||||
|
mapMetadataAndWrite(outputStream, metadata, mapping);
|
||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
{
|
{
|
||||||
@@ -539,8 +513,9 @@ public abstract class AbstractMetadataExtractor implements CustomTransformer
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public abstract Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
public abstract Map<String, Serializable> extractMetadata(String sourceMimetype, InputStream inputStream,
|
||||||
File sourceFile) throws Exception;
|
String targetMimetype, OutputStream outputStream, Map<String, String> transformOptions,
|
||||||
|
TransformManager transformManager) throws Exception;
|
||||||
|
|
||||||
private Map<String, Set<String>> getExtractMappingFromOptions(Map<String, String> transformOptions, Map<String,
|
private Map<String, Set<String>> getExtractMappingFromOptions(Map<String, String> transformOptions, Map<String,
|
||||||
Set<String>> defaultExtractMapping)
|
Set<String>> defaultExtractMapping)
|
||||||
@@ -561,17 +536,7 @@ public abstract class AbstractMetadataExtractor implements CustomTransformer
|
|||||||
return defaultExtractMapping;
|
return defaultExtractMapping;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public void mapMetadataAndWrite(OutputStream outputStream, Map<String, Serializable> metadata,
|
||||||
* @deprecated use {@link #extractMetadata(String, Map, File, File)} rather than calling this method.
|
|
||||||
* By default call the overloaded method with the default {@code extractMapping}.
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public void mapMetadataAndWrite(File targetFile, Map<String, Serializable> metadata) throws IOException
|
|
||||||
{
|
|
||||||
mapMetadataAndWrite(targetFile, metadata, defaultExtractMapping);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void mapMetadataAndWrite(File targetFile, Map<String, Serializable> metadata,
|
|
||||||
Map<String, Set<String>> extractMapping) throws IOException
|
Map<String, Set<String>> extractMapping) throws IOException
|
||||||
{
|
{
|
||||||
if (logger.isDebugEnabled())
|
if (logger.isDebugEnabled())
|
||||||
@@ -581,7 +546,7 @@ public abstract class AbstractMetadataExtractor implements CustomTransformer
|
|||||||
}
|
}
|
||||||
|
|
||||||
metadata = mapRawToSystem(metadata, extractMapping);
|
metadata = mapRawToSystem(metadata, extractMapping);
|
||||||
writeMetadata(targetFile, metadata);
|
writeMetadata(outputStream, metadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -633,9 +598,9 @@ public abstract class AbstractMetadataExtractor implements CustomTransformer
|
|||||||
return new TreeMap<String, Serializable>(systemProperties);
|
return new TreeMap<String, Serializable>(systemProperties);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeMetadata(File targetFile, Map<String, Serializable> results)
|
private void writeMetadata(OutputStream outputStream, Map<String, Serializable> results)
|
||||||
throws IOException
|
throws IOException
|
||||||
{
|
{
|
||||||
jsonObjectMapper.writeValue(targetFile, results);
|
jsonObjectMapper.writeValue(outputStream, results);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -29,7 +29,6 @@ package org.alfresco.transform.misc.metadataExtractors;
|
|||||||
import org.alfresco.transform.base.CustomTransformer;
|
import org.alfresco.transform.base.CustomTransformer;
|
||||||
import org.alfresco.transform.base.TransformManager;
|
import org.alfresco.transform.base.TransformManager;
|
||||||
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
|
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
|
||||||
import org.alfresco.transform.common.TransformException;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -87,8 +86,17 @@ public class HtmlMetadataExtractor extends AbstractMetadataExtractor implements
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
public void embedMetadata(String sourceMimetype, InputStream inputStream, String targetMimetype,
|
||||||
File sourceFile) throws Exception
|
OutputStream outputStream, Map<String, String> transformOptions, TransformManager transformManager)
|
||||||
|
throws Exception
|
||||||
|
{
|
||||||
|
// Only used for extract, so may be empty.
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, Serializable> extractMetadata(String sourceMimetype, InputStream inputStream,
|
||||||
|
String targetMimetype, OutputStream outputStream, Map<String, String> transformOptions,
|
||||||
|
TransformManager transformManager) throws Exception
|
||||||
{
|
{
|
||||||
final Map<String, Serializable> rawProperties = new HashMap<>();
|
final Map<String, Serializable> rawProperties = new HashMap<>();
|
||||||
|
|
||||||
@@ -175,10 +183,10 @@ public class HtmlMetadataExtractor extends AbstractMetadataExtractor implements
|
|||||||
rawProperties.clear();
|
rawProperties.clear();
|
||||||
Reader r = null;
|
Reader r = null;
|
||||||
|
|
||||||
try (InputStream cis = new FileInputStream(sourceFile))
|
try
|
||||||
{
|
{
|
||||||
// TODO: for now, use default charset; we should attempt to map from html meta-data
|
// TODO: for now, use default charset; we should attempt to map from html meta-data
|
||||||
r = new InputStreamReader(cis, charsetGuess);
|
r = new InputStreamReader(inputStream, charsetGuess);
|
||||||
HTMLEditorKit.Parser parser = new ParserDelegator();
|
HTMLEditorKit.Parser parser = new ParserDelegator();
|
||||||
parser.parse(r, callback, tries > 0);
|
parser.parse(r, callback, tries > 0);
|
||||||
break;
|
break;
|
||||||
|
@@ -29,7 +29,6 @@ package org.alfresco.transform.misc.metadataExtractors;
|
|||||||
import org.alfresco.transform.base.CustomTransformer;
|
import org.alfresco.transform.base.CustomTransformer;
|
||||||
import org.alfresco.transform.base.TransformManager;
|
import org.alfresco.transform.base.TransformManager;
|
||||||
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
|
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
|
||||||
import org.alfresco.transform.common.TransformException;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -89,102 +88,108 @@ public class RFC822MetadataExtractor extends AbstractMetadataExtractor implement
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
public void embedMetadata(String sourceMimetype, InputStream inputStream, String targetMimetype,
|
||||||
File sourceFile) throws Exception
|
OutputStream outputStream, Map<String, String> transformOptions, TransformManager transformManager)
|
||||||
|
throws Exception
|
||||||
|
{
|
||||||
|
// Only used for extract, so may be empty.
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, Serializable> extractMetadata(String sourceMimetype, InputStream inputStream,
|
||||||
|
String targetMimetype, OutputStream outputStream, Map<String, String> transformOptions,
|
||||||
|
TransformManager transformManager) throws Exception
|
||||||
{
|
{
|
||||||
final Map<String, Serializable> rawProperties = new HashMap<>();
|
final Map<String, Serializable> rawProperties = new HashMap<>();
|
||||||
|
|
||||||
try (InputStream is = new FileInputStream(sourceFile))
|
MimeMessage mimeMessage = new MimeMessage(null, inputStream);
|
||||||
|
|
||||||
|
if (mimeMessage != null)
|
||||||
{
|
{
|
||||||
MimeMessage mimeMessage = new MimeMessage(null, is);
|
/**
|
||||||
|
* Extract RFC822 values that don't correspond directly to headers and need to be encoded.
|
||||||
|
* Or those special fields that require some code to extract data
|
||||||
|
*/
|
||||||
|
String tmp = InternetAddress.toString(mimeMessage.getFrom());
|
||||||
|
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
||||||
|
putRawValue(KEY_MESSAGE_FROM, tmp, rawProperties);
|
||||||
|
|
||||||
if (mimeMessage != null)
|
tmp = InternetAddress.toString(mimeMessage.getRecipients(RecipientType.TO));
|
||||||
|
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
||||||
|
putRawValue(KEY_MESSAGE_TO, tmp, rawProperties);
|
||||||
|
|
||||||
|
tmp = InternetAddress.toString(mimeMessage.getRecipients(RecipientType.CC));
|
||||||
|
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
||||||
|
putRawValue(KEY_MESSAGE_CC, tmp, rawProperties);
|
||||||
|
|
||||||
|
putRawValue(KEY_MESSAGE_SENT, mimeMessage.getSentDate(), rawProperties);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Received field from RFC 822
|
||||||
|
*
|
||||||
|
* "Received" ":" ; one per relay
|
||||||
|
* ["from" domain] ; sending host
|
||||||
|
* ["by" domain] ; receiving host
|
||||||
|
* ["via" atom] ; physical path
|
||||||
|
* ("with" atom) ; link/mail protocol
|
||||||
|
* ["id" msg-id] ; receiver msg id
|
||||||
|
* ["for" addr-spec] ; initial form
|
||||||
|
* ";" date-time ; time received
|
||||||
|
*/
|
||||||
|
Date rxDate = mimeMessage.getReceivedDate();
|
||||||
|
|
||||||
|
if(rxDate != null)
|
||||||
{
|
{
|
||||||
/**
|
// The email implementation extracted the received date for us.
|
||||||
* Extract RFC822 values that doesn't match to headers and need to be encoded.
|
putRawValue(KEY_MESSAGE_RECEIVED, rxDate, rawProperties);
|
||||||
* Or those special fields that require some code to extract data
|
}
|
||||||
*/
|
else
|
||||||
String tmp = InternetAddress.toString(mimeMessage.getFrom());
|
{
|
||||||
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
// the email implementation did not parse the received date for us.
|
||||||
putRawValue(KEY_MESSAGE_FROM, tmp, rawProperties);
|
String[] rx = mimeMessage.getHeader("received");
|
||||||
|
if(rx != null && rx.length > 0)
|
||||||
tmp = InternetAddress.toString(mimeMessage.getRecipients(RecipientType.TO));
|
|
||||||
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
|
||||||
putRawValue(KEY_MESSAGE_TO, tmp, rawProperties);
|
|
||||||
|
|
||||||
tmp = InternetAddress.toString(mimeMessage.getRecipients(RecipientType.CC));
|
|
||||||
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
|
||||||
putRawValue(KEY_MESSAGE_CC, tmp, rawProperties);
|
|
||||||
|
|
||||||
putRawValue(KEY_MESSAGE_SENT, mimeMessage.getSentDate(), rawProperties);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Received field from RFC 822
|
|
||||||
*
|
|
||||||
* "Received" ":" ; one per relay
|
|
||||||
* ["from" domain] ; sending host
|
|
||||||
* ["by" domain] ; receiving host
|
|
||||||
* ["via" atom] ; physical path
|
|
||||||
* ("with" atom) ; link/mail protocol
|
|
||||||
* ["id" msg-id] ; receiver msg id
|
|
||||||
* ["for" addr-spec] ; initial form
|
|
||||||
* ";" date-time ; time received
|
|
||||||
*/
|
|
||||||
Date rxDate = mimeMessage.getReceivedDate();
|
|
||||||
|
|
||||||
if(rxDate != null)
|
|
||||||
{
|
{
|
||||||
// The email implementation extracted the received date for us.
|
String lastReceived = rx[0];
|
||||||
putRawValue(KEY_MESSAGE_RECEIVED, rxDate, rawProperties);
|
lastReceived = MimeUtility.unfold(lastReceived);
|
||||||
}
|
int x = lastReceived.lastIndexOf(';');
|
||||||
else
|
if(x > 0)
|
||||||
{
|
|
||||||
// the email implementation did not parse the received date for us.
|
|
||||||
String[] rx = mimeMessage.getHeader("received");
|
|
||||||
if(rx != null && rx.length > 0)
|
|
||||||
{
|
{
|
||||||
String lastReceived = rx[0];
|
String dateStr = lastReceived.substring(x + 1).trim();
|
||||||
lastReceived = MimeUtility.unfold(lastReceived);
|
putRawValue(KEY_MESSAGE_RECEIVED, dateStr, rawProperties);
|
||||||
int x = lastReceived.lastIndexOf(';');
|
|
||||||
if(x > 0)
|
|
||||||
{
|
|
||||||
String dateStr = lastReceived.substring(x + 1).trim();
|
|
||||||
putRawValue(KEY_MESSAGE_RECEIVED, dateStr, rawProperties);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
String[] subj = mimeMessage.getHeader("Subject");
|
String[] subj = mimeMessage.getHeader("Subject");
|
||||||
if (subj != null && subj.length > 0)
|
if (subj != null && subj.length > 0)
|
||||||
|
{
|
||||||
|
String decodedSubject = subj[0];
|
||||||
|
try
|
||||||
{
|
{
|
||||||
String decodedSubject = subj[0];
|
decodedSubject = MimeUtility.decodeText(decodedSubject);
|
||||||
try
|
|
||||||
{
|
|
||||||
decodedSubject = MimeUtility.decodeText(decodedSubject);
|
|
||||||
}
|
|
||||||
catch (UnsupportedEncodingException e)
|
|
||||||
{
|
|
||||||
logger.warn(e.toString());
|
|
||||||
}
|
|
||||||
putRawValue(KEY_MESSAGE_SUBJECT, decodedSubject, rawProperties);
|
|
||||||
}
|
}
|
||||||
|
catch (UnsupportedEncodingException e)
|
||||||
/*
|
|
||||||
* Extract values from all header fields, including extension fields "X-"
|
|
||||||
*/
|
|
||||||
Set<String> keys = getExtractMapping().keySet();
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
Enumeration<Header> headers = mimeMessage.getAllHeaders();
|
|
||||||
while (headers.hasMoreElements())
|
|
||||||
{
|
{
|
||||||
Header header = (Header) headers.nextElement();
|
logger.warn(e.toString());
|
||||||
if (keys.contains(header.getName()))
|
}
|
||||||
{
|
putRawValue(KEY_MESSAGE_SUBJECT, decodedSubject, rawProperties);
|
||||||
tmp = header.getValue();
|
}
|
||||||
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
|
||||||
|
|
||||||
putRawValue(header.getName(), tmp, rawProperties);
|
/*
|
||||||
}
|
* Extract values from all header fields, including extension fields "X-"
|
||||||
|
*/
|
||||||
|
Set<String> keys = getExtractMapping().keySet();
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
Enumeration<Header> headers = mimeMessage.getAllHeaders();
|
||||||
|
while (headers.hasMoreElements())
|
||||||
|
{
|
||||||
|
Header header = (Header) headers.nextElement();
|
||||||
|
if (keys.contains(header.getName()))
|
||||||
|
{
|
||||||
|
tmp = header.getValue();
|
||||||
|
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
||||||
|
|
||||||
|
putRawValue(header.getName(), tmp, rawProperties);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -51,7 +51,6 @@ import org.xml.sax.Attributes;
|
|||||||
import org.xml.sax.ContentHandler;
|
import org.xml.sax.ContentHandler;
|
||||||
import org.xml.sax.Locator;
|
import org.xml.sax.Locator;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
@@ -215,95 +214,93 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
public Map<String, Serializable> extractMetadata(String sourceMimetype, InputStream inputStream,
|
||||||
File sourceFile) throws Exception
|
String targetMimetype, OutputStream outputStream, Map<String, String> transformOptions,
|
||||||
|
TransformManager transformManager) throws Exception
|
||||||
{
|
{
|
||||||
Map<String, Serializable> rawProperties = new HashMap<>();
|
Map<String, Serializable> rawProperties = new HashMap<>();
|
||||||
|
|
||||||
try (InputStream is = new FileInputStream(sourceFile))
|
Parser parser = getParser();
|
||||||
|
|
||||||
|
Metadata metadata = new Metadata();
|
||||||
|
metadata.add(Metadata.CONTENT_TYPE, sourceMimetype);
|
||||||
|
|
||||||
|
ParseContext context = buildParseContext(metadata, sourceMimetype);
|
||||||
|
|
||||||
|
ContentHandler handler;
|
||||||
|
Map<String,String> headers = null;
|
||||||
|
if (needHeaderContents())
|
||||||
{
|
{
|
||||||
Parser parser = getParser();
|
MapCaptureContentHandler headerCapture =
|
||||||
|
new MapCaptureContentHandler();
|
||||||
Metadata metadata = new Metadata();
|
headers = headerCapture.tags;
|
||||||
metadata.add(Metadata.CONTENT_TYPE, sourceMimetype);
|
handler = new HeadContentHandler(headerCapture);
|
||||||
|
|
||||||
ParseContext context = buildParseContext(metadata, sourceMimetype);
|
|
||||||
|
|
||||||
ContentHandler handler;
|
|
||||||
Map<String,String> headers = null;
|
|
||||||
if (needHeaderContents())
|
|
||||||
{
|
|
||||||
MapCaptureContentHandler headerCapture =
|
|
||||||
new MapCaptureContentHandler();
|
|
||||||
headers = headerCapture.tags;
|
|
||||||
handler = new HeadContentHandler(headerCapture);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
handler = new NullContentHandler();
|
|
||||||
}
|
|
||||||
|
|
||||||
parser.parse(is, handler, metadata, context);
|
|
||||||
|
|
||||||
// First up, copy all the Tika metadata over
|
|
||||||
// This allows people to map any of the Tika
|
|
||||||
// keys onto their own content model
|
|
||||||
for (String tikaKey : metadata.names())
|
|
||||||
{
|
|
||||||
// TODO review this change (part of MNT-15267) - should we really force string concatenation here !?
|
|
||||||
putRawValue(tikaKey, getMetadataValue(metadata, Property.internalText(tikaKey)), rawProperties);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now, map the common Tika metadata keys onto
|
|
||||||
// the common Alfresco metadata keys. This allows
|
|
||||||
// existing mapping properties files to continue
|
|
||||||
// to work without needing any changes
|
|
||||||
|
|
||||||
// The simple ones
|
|
||||||
putRawValue(KEY_AUTHOR, getMetadataValue(metadata, TikaCoreProperties.CREATOR), rawProperties);
|
|
||||||
putRawValue(KEY_TITLE, getMetadataValue(metadata, TikaCoreProperties.TITLE), rawProperties);
|
|
||||||
putRawValue(KEY_COMMENTS, getMetadataValue(metadata, TikaCoreProperties.COMMENTS), rawProperties);
|
|
||||||
|
|
||||||
// Tags
|
|
||||||
putRawValue(KEY_TAGS, getMetadataValues(metadata, KEY_TAGS), rawProperties);
|
|
||||||
|
|
||||||
// Get the subject and description, despite things not
|
|
||||||
// being nearly as consistent as one might hope
|
|
||||||
String subject = getMetadataValue(metadata, TikaCoreProperties.SUBJECT);
|
|
||||||
String description = getMetadataValue(metadata, TikaCoreProperties.DESCRIPTION);
|
|
||||||
if (subject != null && description != null)
|
|
||||||
{
|
|
||||||
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
|
||||||
putRawValue(KEY_SUBJECT, subject, rawProperties);
|
|
||||||
}
|
|
||||||
else if (subject != null)
|
|
||||||
{
|
|
||||||
putRawValue(KEY_DESCRIPTION, subject, rawProperties);
|
|
||||||
putRawValue(KEY_SUBJECT, subject, rawProperties);
|
|
||||||
}
|
|
||||||
else if (description != null)
|
|
||||||
{
|
|
||||||
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
|
||||||
putRawValue(KEY_SUBJECT, description, rawProperties);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try for the dates two different ways too
|
|
||||||
if (metadata.get(TikaCoreProperties.CREATED) != null)
|
|
||||||
{
|
|
||||||
putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.CREATED), rawProperties);
|
|
||||||
}
|
|
||||||
else if (metadata.get(TikaCoreProperties.MODIFIED) != null)
|
|
||||||
{
|
|
||||||
putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.MODIFIED), rawProperties);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If people created a specific instance
|
|
||||||
// (eg OfficeMetadataExtractor), then allow that
|
|
||||||
// instance to map the Tika keys onto its
|
|
||||||
// existing namespace so that older properties
|
|
||||||
// files continue to map correctly
|
|
||||||
rawProperties = extractSpecific(metadata, rawProperties, headers);
|
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
handler = new NullContentHandler();
|
||||||
|
}
|
||||||
|
|
||||||
|
parser.parse(inputStream, handler, metadata, context);
|
||||||
|
|
||||||
|
// First up, copy all the Tika metadata over
|
||||||
|
// This allows people to map any of the Tika
|
||||||
|
// keys onto their own content model
|
||||||
|
for (String tikaKey : metadata.names())
|
||||||
|
{
|
||||||
|
// TODO review this change (part of MNT-15267) - should we really force string concatenation here !?
|
||||||
|
putRawValue(tikaKey, getMetadataValue(metadata, Property.internalText(tikaKey)), rawProperties);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now, map the common Tika metadata keys onto
|
||||||
|
// the common Alfresco metadata keys. This allows
|
||||||
|
// existing mapping properties files to continue
|
||||||
|
// to work without needing any changes
|
||||||
|
|
||||||
|
// The simple ones
|
||||||
|
putRawValue(KEY_AUTHOR, getMetadataValue(metadata, TikaCoreProperties.CREATOR), rawProperties);
|
||||||
|
putRawValue(KEY_TITLE, getMetadataValue(metadata, TikaCoreProperties.TITLE), rawProperties);
|
||||||
|
putRawValue(KEY_COMMENTS, getMetadataValue(metadata, TikaCoreProperties.COMMENTS), rawProperties);
|
||||||
|
|
||||||
|
// Tags
|
||||||
|
putRawValue(KEY_TAGS, getMetadataValues(metadata, KEY_TAGS), rawProperties);
|
||||||
|
|
||||||
|
// Get the subject and description, despite things not
|
||||||
|
// being nearly as consistent as one might hope
|
||||||
|
String subject = getMetadataValue(metadata, TikaCoreProperties.SUBJECT);
|
||||||
|
String description = getMetadataValue(metadata, TikaCoreProperties.DESCRIPTION);
|
||||||
|
if (subject != null && description != null)
|
||||||
|
{
|
||||||
|
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
||||||
|
putRawValue(KEY_SUBJECT, subject, rawProperties);
|
||||||
|
}
|
||||||
|
else if (subject != null)
|
||||||
|
{
|
||||||
|
putRawValue(KEY_DESCRIPTION, subject, rawProperties);
|
||||||
|
putRawValue(KEY_SUBJECT, subject, rawProperties);
|
||||||
|
}
|
||||||
|
else if (description != null)
|
||||||
|
{
|
||||||
|
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
||||||
|
putRawValue(KEY_SUBJECT, description, rawProperties);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try for the dates two different ways too
|
||||||
|
if (metadata.get(TikaCoreProperties.CREATED) != null)
|
||||||
|
{
|
||||||
|
putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.CREATED), rawProperties);
|
||||||
|
}
|
||||||
|
else if (metadata.get(TikaCoreProperties.MODIFIED) != null)
|
||||||
|
{
|
||||||
|
putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.MODIFIED), rawProperties);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If people created a specific instance
|
||||||
|
// (eg OfficeMetadataExtractor), then allow that
|
||||||
|
// instance to map the Tika keys onto its
|
||||||
|
// existing namespace so that older properties
|
||||||
|
// files continue to map correctly
|
||||||
|
rawProperties = extractSpecific(metadata, rawProperties, headers);
|
||||||
|
|
||||||
return rawProperties;
|
return rawProperties;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user