diff --git a/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java index 82d3205129..8d0fb135a0 100644 --- a/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/DWGMetadataExtracter.java @@ -66,7 +66,7 @@ public class DWGMetadataExtracter extends TikaPoweredMetadataExtracter @Override protected Map extractSpecific(Metadata metadata, - Map properties) { + Map properties, Map headers) { putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties); putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties); return properties; diff --git a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java index b9921e6244..1cb213f75e 100644 --- a/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MP3MetadataExtracter.java @@ -81,7 +81,7 @@ public class MP3MetadataExtracter extends TikaPoweredMetadataExtracter @Override protected Map extractSpecific(Metadata metadata, - Map properties) { + Map properties, Map headers) { putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties); putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties); putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties); diff --git a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java index a8f9700da4..08ca3802b4 100644 --- a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracter.java @@ -18,20 +18,14 @@ */ package org.alfresco.repo.content.metadata; -import java.io.IOException; -import java.io.InputStream; import java.io.Serializable; 
import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; import java.util.Map; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; -import org.apache.poi.hsmf.MAPIMessage; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; -//import org.apache.tika.parser.microsoft.OutlookExtractor; // TODO fix import +import org.apache.tika.parser.microsoft.OfficeParser; /** * Outlook MAPI format email meta-data extractor extracting the following values: @@ -41,6 +35,9 @@ import org.apache.tika.parser.Parser; * addressee: -- cm:addressee * addressees: -- cm:addressees * subjectLine: -- cm:subjectline, cm:description + * toNames: -- + * ccNames: -- + * bccNames: -- * * * TIKA note - to/cc/bcc go into the html part, not the metadata. @@ -56,6 +53,9 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter private static final String KEY_ADDRESSEE = "addressee"; private static final String KEY_ADDRESSEES = "addressees"; private static final String KEY_SUBJECT = "subjectLine"; + private static final String KEY_TO_NAMES = "toNames"; + private static final String KEY_CC_NAMES = "ccNames"; + private static final String KEY_BCC_NAMES = "bccNames"; public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes( new String[] {MimetypeMap.MIMETYPE_OUTLOOK_MSG}, @@ -69,58 +69,29 @@ public class MailMetadataExtracter extends TikaPoweredMetadataExtracter @Override protected Parser getParser() { - //return new OutlookExtractor(); // TODO fix import - return null; + // The office parser does Outlook as well as Word, Excel etc + return new OfficeParser(); } @Override protected Map extractSpecific(Metadata metadata, - Map properties) { - // TODO move things from extractRaw to here + Map properties, Map headers) { + putRawValue(KEY_ORIGINATOR, metadata.get(Metadata.AUTHOR), properties); + putRawValue(KEY_SUBJECT, metadata.get(Metadata.TITLE), properties); + 
putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.SUBJECT), properties); + putRawValue(KEY_SENT_DATE, metadata.get(Metadata.LAST_SAVED), properties); + + // Store the TO, but not cc/bcc in the addressee field + putRawValue(KEY_ADDRESSEE, metadata.get(Metadata.MESSAGE_TO), properties); + + // Store each of To, CC and BCC in their own fields + putRawValue(KEY_TO_NAMES, metadata.get(Metadata.MESSAGE_TO), properties); + putRawValue(KEY_CC_NAMES, metadata.get(Metadata.MESSAGE_CC), properties); + putRawValue(KEY_BCC_NAMES, metadata.get(Metadata.MESSAGE_BCC), properties); + + // But store all email addresses (to/cc/bcc) in the addresses field + putRawValue(KEY_ADDRESSEES, metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS), properties); + return properties; } - - @Override - public Map extractRaw(ContentReader reader) throws Throwable - { - // TODO remove this in favour of extractSpecific - final Map rawProperties = newRawMap(); - - InputStream is = null; - try - { - is = reader.getContentInputStream(); - MAPIMessage msg; - - try - { - msg = new MAPIMessage(is); - msg.setReturnNullOnMissingChunk(true); - - putRawValue(KEY_ORIGINATOR, msg.getDisplayFrom(), rawProperties); - putRawValue(KEY_SUBJECT, msg.getSubject(), rawProperties); - putRawValue(KEY_SENT_DATE, msg.getMessageDate().getTime(), rawProperties); - - // Store the TO, but not cc/bcc in the addressee field - putRawValue(KEY_ADDRESSEE, msg.getDisplayTo(), rawProperties); - // But store all email addresses (to/cc/bcc) in the addresses field - putRawValue(KEY_ADDRESSEES, msg.getRecipientEmailAddressList(), rawProperties); - } - catch (IOException err) - { - // probably not an Outlook format MSG - ignore for now - if (logger.isWarnEnabled()) - logger.warn("Unable to extract meta-data from message: " + err.getMessage()); - } - } - finally - { - if (is != null) - { - try { is.close(); } catch (IOException e) {} - } - } - // Done - return rawProperties; - } } diff --git 
a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracterTest.java index f01af8a74e..abffac8d20 100644 --- a/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/MailMetadataExtracterTest.java @@ -113,19 +113,14 @@ public class MailMetadataExtracterTest extends AbstractMetadataExtracterTest DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_ADDRESSEE))); // Addressees - Collection addressees = (Collection)properties.get(ContentModel.PROP_ADDRESSEES); assertTrue( "Property " + ContentModel.PROP_ADDRESSEES + " not found for mimetype " + mimetype, - addressees != null + properties.get(ContentModel.PROP_ADDRESSEES) != null ); - assertEquals( - "Property " + ContentModel.PROP_ADDRESSEES + " wrong size for mimetype " + mimetype, - 1, - addressees.size()); assertEquals( "Property " + ContentModel.PROP_ADDRESSEES + " wrong content for mimetype " + mimetype, "kevin.roast@alfresco.org", - DefaultTypeConverter.INSTANCE.convert(String.class, addressees.iterator().next())); + DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_ADDRESSEES))); // Subject Line assertEquals( diff --git a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java index bb88c51376..25c6da8aa6 100644 --- a/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OfficeMetadataExtracter.java @@ -49,7 +49,7 @@ import org.apache.tika.parser.microsoft.OfficeParser; * * * Uses Apache Tika - + * * @author Derek Hulley * @author Nick Burch */ @@ -92,7 +92,7 @@ public class OfficeMetadataExtracter extends TikaPoweredMetadataExtracter @Override protected Map extractSpecific(Metadata metadata, - Map properties) { 
+ Map properties, Map headers) { putRawValue(KEY_CREATE_DATETIME, metadata.get(Metadata.CREATION_DATE), properties); putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(Metadata.LAST_SAVED), properties); putRawValue(KEY_EDIT_TIME, metadata.get(Metadata.EDIT_TIME), properties); diff --git a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java index 1fa21e6598..ace7072818 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenDocumentMetadataExtracter.java @@ -18,25 +18,19 @@ */ package org.alfresco.repo.content.metadata; -import java.io.IOException; -import java.io.InputStream; import java.io.Serializable; import java.text.ParseException; import java.text.SimpleDateFormat; -import java.util.Arrays; +import java.util.ArrayList; import java.util.Date; -import java.util.HashSet; import java.util.Map; import java.util.Set; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.namespace.QName; import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.apache.tika.parser.odf.OpenDocumentParser; -import org.apache.tika.sax.BodyContentHandler; -import org.xml.sax.ContentHandler; /** @@ -59,31 +53,30 @@ import org.xml.sax.ContentHandler; * All user properties * * - * TIKA Note - this has been converted to deep-call into Tika. - * This will be replaced with proper calls to Tika at a later date. - * Everything except some Print info has been ported to Tika. 
+ * Uses Apache Tika + * + * TODO decide if we need the few print info bits that + * Tika currently doesn't handle * * @author Antti Jokipii * @author Derek Hulley */ -public class OpenDocumentMetadataExtracter extends AbstractMappingMetadataExtracter +public class OpenDocumentMetadataExtracter extends TikaPoweredMetadataExtracter { private static final String KEY_CREATION_DATE = "creationDate"; private static final String KEY_CREATOR = "creator"; private static final String KEY_DATE = "date"; - private static final String KEY_DESCRIPTION = "description"; private static final String KEY_GENERATOR = "generator"; private static final String KEY_INITIAL_CREATOR = "initialCreator"; private static final String KEY_KEYWORD = "keyword"; private static final String KEY_LANGUAGE = "language"; private static final String KEY_PRINT_DATE = "printDate"; private static final String KEY_PRINTED_BY = "printedBy"; - private static final String KEY_SUBJECT = "subject"; - private static final String KEY_TITLE = "title"; private static final String CUSTOM_PREFIX = "custom:"; - public static String[] SUPPORTED_MIMETYPES = new String[] { + public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes( + new String[] { MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT, MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE, MimetypeMap.MIMETYPE_OPENDOCUMENT_GRAPHICS, @@ -100,71 +93,55 @@ public class OpenDocumentMetadataExtracter extends AbstractMappingMetadataExtrac MimetypeMap.MIMETYPE_OPENDOCUMENT_FORMULA_TEMPLATE, MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_MASTER, MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_WEB, - MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE }; + MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE + }, new OpenDocumentParser() + ); private static final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss"); public OpenDocumentMetadataExtracter() { - super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); + super(SUPPORTED_MIMETYPES); + } + + @Override + protected Parser 
getParser() { + return new OpenDocumentParser(); } @Override - public Map extractRaw(ContentReader reader) throws Throwable - { - Map rawProperties = newRawMap(); - - InputStream is = null; - try - { - is = reader.getContentInputStream(); - - OpenDocumentParser docParser = new OpenDocumentParser(); - ContentHandler handler = new BodyContentHandler() ; - Metadata metadata = new Metadata(); - ParseContext context = new ParseContext(); - - docParser.parse(is, handler, metadata, context); - - putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(Metadata.CREATION_DATE)), rawProperties); - putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), rawProperties); - putRawValue(KEY_DATE, getDateOrNull(metadata.get(Metadata.DATE)), rawProperties); - putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), rawProperties); - putRawValue(KEY_GENERATOR, metadata.get("generator"), rawProperties); - putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), rawProperties); - putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), rawProperties); - putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), rawProperties); -// putRawValue(KEY_PRINT_DATE, getDateOrNull(metadata.get(Metadata.)), rawProperties); -// putRawValue(KEY_PRINTED_BY, metadata.get(Metadata.), rawProperties); - putRawValue(KEY_SUBJECT, metadata.get(Metadata.SUBJECT), rawProperties); - putRawValue(KEY_TITLE, metadata.get(Metadata.TITLE), rawProperties); - - // Handle user-defined properties dynamically - Map> mapping = super.getMapping(); - for (String key : mapping.keySet()) - { - if (metadata.get(CUSTOM_PREFIX + key) != null) - { - putRawValue(key, metadata.get(CUSTOM_PREFIX + key), rawProperties); - } - } - } - finally - { - if (is != null) - { - try { is.close(); } catch (IOException e) {} - } - } - // Done - return rawProperties; + protected Map extractSpecific(Metadata metadata, + Map properties, Map headers) { + putRawValue(KEY_CREATION_DATE, 
getDateOrNull(metadata.get(Metadata.CREATION_DATE)), properties); + putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), properties); + putRawValue(KEY_DATE, getDateOrNull(metadata.get(Metadata.DATE)), properties); + putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), properties); + putRawValue(KEY_GENERATOR, metadata.get("generator"), properties); + putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties); + putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties); + putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), properties); +// putRawValue(KEY_PRINT_DATE, getDateOrNull(metadata.get(Metadata.)), rawProperties); +// putRawValue(KEY_PRINTED_BY, metadata.get(Metadata.), rawProperties); + + // Handle user-defined properties dynamically + Map> mapping = super.getMapping(); + for (String key : mapping.keySet()) + { + if (metadata.get(CUSTOM_PREFIX + key) != null) + { + putRawValue(key, metadata.get(CUSTOM_PREFIX + key), properties); + } + } + + return properties; } - - private Date getDateOrNull(String dateString) throws ParseException + private Date getDateOrNull(String dateString) { if (dateString != null && dateString.length() != 0) { - return dateFormat.parse(dateString); + try { + return dateFormat.parse(dateString); + } catch(ParseException e) {} } return null; diff --git a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java index 0d54f017e3..872888cb67 100644 --- a/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/OpenOfficeMetadataExtracter.java @@ -35,8 +35,8 @@ import org.alfresco.util.PropertyCheck; * description: -- cm:description * * - * TIKA Note - this probably won't be ported to TIKA. There's currently - * no support for these old formats in tika. 
+ * Note - not converted to Apache Tika, as currently Tika + * lacks support for these older formats * * @author Jesper Steen Møller */ diff --git a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java index 3204dd8cfc..0ee0db9c03 100644 --- a/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/PdfBoxMetadataExtracter.java @@ -18,22 +18,13 @@ */ package org.alfresco.repo.content.metadata; -import java.io.IOException; -import java.io.InputStream; -import java.io.Serializable; -import java.text.SimpleDateFormat; -import java.util.Arrays; -import java.util.Calendar; -import java.util.Date; -import java.util.HashSet; -import java.util.Map; +import java.util.ArrayList; import org.alfresco.repo.content.MimetypeMap; -import org.alfresco.service.cmr.repository.ContentReader; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.pdf.PDFParser; /** * Metadata extractor for the PDF documents. @@ -42,115 +33,31 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation; * title: -- cm:title * subject: -- cm:description * created: -- cm:created - * Any custom property: -- [not mapped] * * - * TIKA Note - all the fields (plus a few others) are present - * in the tika metadata. 
+ * Uses Apache Tika + * + * TODO - Update Tika to handle custom metadata * * @author Jesper Steen Møller * @author Derek Hulley */ -public class PdfBoxMetadataExtracter extends AbstractMappingMetadataExtracter +public class PdfBoxMetadataExtracter extends TikaPoweredMetadataExtracter { protected static Log pdfLogger = LogFactory.getLog(PdfBoxMetadataExtracter.class); - private static final String KEY_AUTHOR = "author"; - private static final String KEY_TITLE = "title"; - private static final String KEY_SUBJECT = "subject"; - private static final String KEY_CREATED = "created"; - - public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_PDF }; + public static ArrayList SUPPORTED_MIMETYPES = buildSupportedMimetypes( + new String[] { MimetypeMap.MIMETYPE_PDF }, + new PDFParser() + ); public PdfBoxMetadataExtracter() { - super(new HashSet(Arrays.asList(SUPPORTED_MIMETYPES))); + super(SUPPORTED_MIMETYPES); } @Override - public Map extractRaw(ContentReader reader) throws Throwable - { - Map rawProperties = newRawMap(); - - PDDocument pdf = null; - InputStream is = null; - try - { - is = reader.getContentInputStream(); - // stream the document in - pdf = PDDocument.load(is); - if (!pdf.isEncrypted()) - { - // Scoop out the metadata - PDDocumentInformation docInfo = pdf.getDocumentInformation(); - - putRawValue(KEY_AUTHOR, docInfo.getAuthor(), rawProperties); - putRawValue(KEY_TITLE, docInfo.getTitle(), rawProperties); - putRawValue(KEY_SUBJECT, docInfo.getSubject(), rawProperties); - - try - { - Calendar created = docInfo.getCreationDate(); - if (created != null) - { - // Work around https://issues.apache.org/jira/browse/PDFBOX-598 - created.set(Calendar.MILLISECOND, 0); - - // Save - putRawValue(KEY_CREATED, created.getTime(), rawProperties); - } - } - catch (IOException iox) - { - // This sometimes fails because the date is a string: ETHREEOH-1936 - // Alfresco bug ETHREEOH-801 refers to a bug in PDFBox 
(http://issues.apache.org/jira/browse/PDFBOX-145) - // where the above call to docInfo.getCreationDate() throws an IOException for some PDFs. - // - // The code below is a workaround for that issue. - - // This creationDate has format: D:20080429+01'00' - String creationDate = docInfo.getCustomMetadataValue("CreationDate"); - - if (pdfLogger.isWarnEnabled()) - { - pdfLogger.warn("IOException caught when extracting metadata from pdf file."); - pdfLogger.warn("This may be caused by a PDFBox bug that can often be worked around. The stack trace below is provided for information purposes only."); - pdfLogger.warn("", iox); - } - - final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd"); - if (creationDate != null && creationDate.length() > 10) // 10 allows for "D:yyyyMMdd" - { - String dateWithoutLeadingDColon = creationDate.substring(2); - Date parsedDate = sdf.parse(dateWithoutLeadingDColon); - putRawValue(KEY_CREATED, parsedDate, rawProperties); - } - } - // Extract remaining custom properties - for (String customProp : super.getMapping().keySet()) - { - if (rawProperties.keySet().contains(customProp)) - { - // Ignore it - continue; - } - String customValue = docInfo.getCustomMetadataValue(customProp); - putRawValue(customProp, customValue, rawProperties); - } - } - } - finally - { - if (is != null) - { - try { is.close(); } catch (IOException e) {} - } - if (pdf != null) - { - try { pdf.close(); } catch (Throwable e) { e.printStackTrace(); } - } - } - // Done - return rawProperties; + protected Parser getParser() { + return new PDFParser(); } } diff --git a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java index 70a19482f5..bf8ba991e9 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaAutoMetadataExtracterTest.java @@ -30,8 +30,9 @@ import 
java.util.Map; import java.util.Set; import org.alfresco.model.ContentModel; -import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.filestore.FileContentReader; import org.alfresco.repo.content.transform.AbstractContentTransformerTest; +import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.datatype.DefaultTypeConverter; import org.alfresco.service.namespace.QName; import org.apache.tika.metadata.Metadata; @@ -39,7 +40,6 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.dwg.DWGParser; import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; import org.apache.tika.parser.mp3.Mp3Parser; @@ -181,5 +181,53 @@ public class TikaAutoMetadataExtracterTest extends AbstractMetadataExtracterTest // mimetype, // DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY))); } - + + /** + * We don't have explicit extractors for most image and video formats. 
+ * Instead, these will be handled by the Auto Tika Parser, and + * this test ensures that they are + */ + public void testImageVideo() throws Throwable { + Map p; + + // Image + p = openAndCheck(".jpg", "image/jpeg"); + assertEquals("409 pixels", p.get("Image Width")); + assertEquals("92 pixels", p.get("Image Height")); + assertEquals("8 bits", p.get("Data Precision")); + + p = openAndCheck(".gif", "image/gif"); + assertEquals("409", p.get("width")); + assertEquals("92", p.get("height")); + + p = openAndCheck(".png", "image/png"); + assertEquals("409", p.get("width")); + assertEquals("92", p.get("height")); + assertEquals("8 8 8", p.get("Data BitsPerSample")); + assertEquals("none", p.get("Transparency Alpha")); + + p = openAndCheck(".bmp", "image/bmp"); + assertEquals("409", p.get("width")); + assertEquals("92", p.get("height")); + assertEquals("8 8 8", p.get("Data BitsPerSample")); + } + private Map openAndCheck(String fileBase, String expMimeType) throws Throwable { + String filename = "quick" + fileBase; + URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename); + File file = new File(url.getFile()); + + // Cheat and ask Tika for the mime type! 
+ AutoDetectParser ap = new AutoDetectParser(); + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, filename); + MediaType mt = ap.getDetector().detect( + new BufferedInputStream(new FileInputStream(file)), metadata); + String mimetype = mt.toString(); + + assertEquals("Wrong mimetype for " + fileBase, mimetype, expMimeType); + + ContentReader sourceReader = new FileContentReader(file); + sourceReader.setMimetype(mimetype); + return extracter.extractRaw(sourceReader); + } } diff --git a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java index e412f33abb..11d4b0d96f 100644 --- a/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java +++ b/source/java/org/alfresco/repo/content/metadata/TikaPoweredMetadataExtracter.java @@ -26,6 +26,7 @@ import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; +import java.util.HashMap; import java.util.HashSet; import java.util.Locale; import java.util.Map; @@ -35,11 +36,17 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ContentHandlerDecorator; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.sax.xpath.Matcher; +import org.apache.tika.sax.xpath.MatchingContentHandler; +import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; /** * The parent of all Metadata Extractors which use @@ -145,11 +152,20 @@ public abstract class 
TikaPoweredMetadataExtracter extends AbstractMappingMetada */ protected abstract Parser getParser(); + /** + * Do we care about the contents of the + * extracted header, or nothing at all? + */ + protected boolean needHeaderContents() { + return false; + } + /** * Allows implementation specific mappings * to be done. */ - protected Map extractSpecific(Metadata metadata, Map properties) { + protected Map extractSpecific(Metadata metadata, + Map properties, Map headers) { return properties; } @@ -163,9 +179,19 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada { is = reader.getContentInputStream(); Parser parser = getParser(); - ContentHandler handler = new BodyContentHandler() ; Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); + + ContentHandler handler; + Map headers = null; + if(needHeaderContents()) { + MapCaptureContentHandler headerCapture = + new MapCaptureContentHandler(); + headers = headerCapture.tags; + handler = new HeadContentHandler(headerCapture); + } else { + handler = new NullContentHandler(); + } parser.parse(is, handler, metadata, context); @@ -213,7 +239,7 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada // instance to map the Tika keys onto its // existing namespace so that older properties // files continue to map correctly - rawProperties = extractSpecific(metadata, rawProperties); + rawProperties = extractSpecific(metadata, rawProperties, headers); } finally { @@ -225,4 +251,123 @@ public abstract class TikaPoweredMetadataExtracter extends AbstractMappingMetada return rawProperties; } + + /** + * This content handler will capture entries from within + * the header of the Tika content XHTML, but ignore the + * rest. + */ + protected static class HeadContentHandler extends ContentHandlerDecorator { + /** + * XHTML XPath parser. 
+ */ + private static final XPathParser PARSER = + new XPathParser("xhtml", XHTMLContentHandler.XHTML); + + /** + * The XPath matcher used to select the XHTML head contents. + */ + private static final Matcher MATCHER = + PARSER.parse("/xhtml:html/xhtml:head/descendant:node()"); + + /** + * Creates a content handler that passes all XHTML head events to the + * given underlying content handler. + * + * @param handler content handler + */ + protected HeadContentHandler(ContentHandler handler) { + super(new MatchingContentHandler(handler, MATCHER)); + } + } + /** + * This content handler will grab all tags and attributes, + * and record the textual content of the last seen one + * of them. + * Normally only used with {@link HeadContentHandler} + */ + protected static class MapCaptureContentHandler implements ContentHandler { + protected Map tags = + new HashMap(); + private StringBuffer text; + + @Override + public void characters(char[] ch, int start, int len) { + if(text != null) { + text.append(ch, start, len); + } + } + @Override + public void endElement(String namespace, String localname, + String qname) { + if(text != null && text.length() > 0) { + tags.put(qname, text.toString()); + } + text = null; + } + @Override + public void startElement(String namespace, String localname, + String qname, Attributes attrs) { + for(int i=0; i