mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-31 17:39:05 +00:00
Complete initial Tika-ification of the metadata extractor
The remaining extractors that were due to be converted to Tika have now been converted; tests have been included for the image metadata extraction, and some extension points for future extractors have been created. git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20669 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -18,22 +18,13 @@
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Arrays;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
|
||||
/**
|
||||
* Metadata extractor for the PDF documents.
|
||||
@@ -42,115 +33,31 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>subject:</b> -- cm:description
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>Any custom property:</b> -- [not mapped]
|
||||
* </pre>
|
||||
*
|
||||
* TIKA Note - all the fields (plus a few others) are present
|
||||
* in the tika metadata.
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* TODO - Update Tika to handle custom metadata
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
* @author Derek Hulley
|
||||
*/
|
||||
public class PdfBoxMetadataExtracter extends AbstractMappingMetadataExtracter
|
||||
public class PdfBoxMetadataExtracter extends TikaPoweredMetadataExtracter
|
||||
{
|
||||
protected static Log pdfLogger = LogFactory.getLog(PdfBoxMetadataExtracter.class);
|
||||
|
||||
private static final String KEY_AUTHOR = "author";
|
||||
private static final String KEY_TITLE = "title";
|
||||
private static final String KEY_SUBJECT = "subject";
|
||||
private static final String KEY_CREATED = "created";
|
||||
|
||||
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_PDF };
|
||||
public static ArrayList<String> SUPPORTED_MIMETYPES = buildSupportedMimetypes(
|
||||
new String[] { MimetypeMap.MIMETYPE_PDF },
|
||||
new PDFParser()
|
||||
);
|
||||
|
||||
public PdfBoxMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
||||
super(SUPPORTED_MIMETYPES);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
||||
{
|
||||
Map<String, Serializable> rawProperties = newRawMap();
|
||||
|
||||
PDDocument pdf = null;
|
||||
InputStream is = null;
|
||||
try
|
||||
{
|
||||
is = reader.getContentInputStream();
|
||||
// stream the document in
|
||||
pdf = PDDocument.load(is);
|
||||
if (!pdf.isEncrypted())
|
||||
{
|
||||
// Scoop out the metadata
|
||||
PDDocumentInformation docInfo = pdf.getDocumentInformation();
|
||||
|
||||
putRawValue(KEY_AUTHOR, docInfo.getAuthor(), rawProperties);
|
||||
putRawValue(KEY_TITLE, docInfo.getTitle(), rawProperties);
|
||||
putRawValue(KEY_SUBJECT, docInfo.getSubject(), rawProperties);
|
||||
|
||||
try
|
||||
{
|
||||
Calendar created = docInfo.getCreationDate();
|
||||
if (created != null)
|
||||
{
|
||||
// Work around https://issues.apache.org/jira/browse/PDFBOX-598
|
||||
created.set(Calendar.MILLISECOND, 0);
|
||||
|
||||
// Save
|
||||
putRawValue(KEY_CREATED, created.getTime(), rawProperties);
|
||||
}
|
||||
}
|
||||
catch (IOException iox)
|
||||
{
|
||||
// This sometimes fails because the date is a string: ETHREEOH-1936
|
||||
// Alfresco bug ETHREEOH-801 refers to a bug in PDFBox (http://issues.apache.org/jira/browse/PDFBOX-145)
|
||||
// where the above call to docInfo.getCreationDate() throws an IOException for some PDFs.
|
||||
//
|
||||
// The code below is a workaround for that issue.
|
||||
|
||||
// This creationDate has format: D:20080429+01'00'
|
||||
String creationDate = docInfo.getCustomMetadataValue("CreationDate");
|
||||
|
||||
if (pdfLogger.isWarnEnabled())
|
||||
{
|
||||
pdfLogger.warn("IOException caught when extracting metadata from pdf file.");
|
||||
pdfLogger.warn("This may be caused by a PDFBox bug that can often be worked around. The stack trace below is provided for information purposes only.");
|
||||
pdfLogger.warn("", iox);
|
||||
}
|
||||
|
||||
final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
|
||||
if (creationDate != null && creationDate.length() > 10) // 10 allows for "D:yyyyMMdd"
|
||||
{
|
||||
String dateWithoutLeadingDColon = creationDate.substring(2);
|
||||
Date parsedDate = sdf.parse(dateWithoutLeadingDColon);
|
||||
putRawValue(KEY_CREATED, parsedDate, rawProperties);
|
||||
}
|
||||
}
|
||||
// Extract remaining custom properties
|
||||
for (String customProp : super.getMapping().keySet())
|
||||
{
|
||||
if (rawProperties.keySet().contains(customProp))
|
||||
{
|
||||
// Ignore it
|
||||
continue;
|
||||
}
|
||||
String customValue = docInfo.getCustomMetadataValue(customProp);
|
||||
putRawValue(customProp, customValue, rawProperties);
|
||||
}
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (IOException e) {}
|
||||
}
|
||||
if (pdf != null)
|
||||
{
|
||||
try { pdf.close(); } catch (Throwable e) { e.printStackTrace(); }
|
||||
}
|
||||
}
|
||||
// Done
|
||||
return rawProperties;
|
||||
protected Parser getParser() {
|
||||
return new PDFParser();
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user