Complete initial Tika-ification of the metadata extractor

The remaining extractors that were due to be converted to Tika have now been
 converted, tests have been added for the image metadata extraction, and some
 extension points for future extractors have been created.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20669 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Nick Burch
2010-06-16 16:19:38 +00:00
parent 0e19812dbc
commit 62f07a8661
10 changed files with 295 additions and 252 deletions

View File

@@ -18,22 +18,13 @@
*/
package org.alfresco.repo.content.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.ArrayList;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
/**
* Metadata extractor for the PDF documents.
@@ -42,115 +33,31 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
* <b>title:</b> -- cm:title
* <b>subject:</b> -- cm:description
* <b>created:</b> -- cm:created
* <b>Any custom property:</b> -- [not mapped]
* </pre>
*
* TIKA Note - all the fields (plus a few others) are present
* in the tika metadata.
* Uses Apache Tika
*
* TODO - Update Tika to handle custom metadata
*
* @author Jesper Steen Møller
* @author Derek Hulley
*/
public class PdfBoxMetadataExtracter extends AbstractMappingMetadataExtracter
public class PdfBoxMetadataExtracter extends TikaPoweredMetadataExtracter
{
protected static Log pdfLogger = LogFactory.getLog(PdfBoxMetadataExtracter.class);
private static final String KEY_AUTHOR = "author";
private static final String KEY_TITLE = "title";
private static final String KEY_SUBJECT = "subject";
private static final String KEY_CREATED = "created";
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_PDF };
public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
new String[] { MimetypeMap.MIMETYPE_PDF },
new PDFParser()
);
public PdfBoxMetadataExtracter()
{
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
super(SUPPORTED_MIMETYPES);
}
@Override
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{
Map<String, Serializable> rawProperties = newRawMap();
PDDocument pdf = null;
InputStream is = null;
try
{
is = reader.getContentInputStream();
// stream the document in
pdf = PDDocument.load(is);
if (!pdf.isEncrypted())
{
// Scoop out the metadata
PDDocumentInformation docInfo = pdf.getDocumentInformation();
putRawValue(KEY_AUTHOR, docInfo.getAuthor(), rawProperties);
putRawValue(KEY_TITLE, docInfo.getTitle(), rawProperties);
putRawValue(KEY_SUBJECT, docInfo.getSubject(), rawProperties);
try
{
Calendar created = docInfo.getCreationDate();
if (created != null)
{
// Work around https://issues.apache.org/jira/browse/PDFBOX-598
created.set(Calendar.MILLISECOND, 0);
// Save
putRawValue(KEY_CREATED, created.getTime(), rawProperties);
}
}
catch (IOException iox)
{
// This sometimes fails because the date is a string: ETHREEOH-1936
// Alfresco bug ETHREEOH-801 refers to a bug in PDFBox (http://issues.apache.org/jira/browse/PDFBOX-145)
// where the above call to docInfo.getCreationDate() throws an IOException for some PDFs.
//
// The code below is a workaround for that issue.
// This creationDate has format: D:20080429+01'00'
String creationDate = docInfo.getCustomMetadataValue("CreationDate");
if (pdfLogger.isWarnEnabled())
{
pdfLogger.warn("IOException caught when extracting metadata from pdf file.");
pdfLogger.warn("This may be caused by a PDFBox bug that can often be worked around. The stack trace below is provided for information purposes only.");
pdfLogger.warn("", iox);
}
final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
if (creationDate != null && creationDate.length() > 10) // 10 allows for "D:yyyyMMdd"
{
String dateWithoutLeadingDColon = creationDate.substring(2);
Date parsedDate = sdf.parse(dateWithoutLeadingDColon);
putRawValue(KEY_CREATED, parsedDate, rawProperties);
}
}
// Extract remaining custom properties
for (String customProp : super.getMapping().keySet())
{
if (rawProperties.keySet().contains(customProp))
{
// Ignore it
continue;
}
String customValue = docInfo.getCustomMetadataValue(customProp);
putRawValue(customProp, customValue, rawProperties);
}
}
}
finally
{
if (is != null)
{
try { is.close(); } catch (IOException e) {}
}
if (pdf != null)
{
try { pdf.close(); } catch (Throwable e) { e.printStackTrace(); }
}
}
// Done
return rawProperties;
/**
* Supplies the Tika parser for PDF content.
*
* NOTE(review): presumably called by the TikaPoweredMetadataExtracter superclass
* to drive extraction - confirm against that class, which is not shown here.
*
* @return a newly constructed {@link PDFParser}; a fresh instance is created on every call
*/
protected Parser getParser() {
return new PDFParser();
}
}