mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-07 18:25:23 +00:00
16971: Merged V3.1 to V3.2 14282: (RECORD ONLY) Updated version to 3.1.1 14565: (RECORD ONLY) Updated version to include revision number (x.y.z) 14848: (RECORD ONLY) Updated version number 15029: (RECORD ONLY) ETHREEOH-441 and ETHREEOH-1862 Exceptions in server when certain PDFs are uploaded. - Probably done already without merge info - Renamed source zip to help tracking 15985: ETHREEOH-2292: Deployment failure in case of IPv6 on Win 2k8 16164: Fixed ETHREEOH-2690: JGroups TCP doesn't bind to specified address 16240: (RECORD ONLY) Fix typos in installer 16726: Fix ETHREEOH-2677 - user usages (when taking ownership) 16745: Fix ETHREEOH-2991 - Deployment exception, unable to deploy - when deploying a manual snapshot with a stale file 16771: (RECORD ONLY) Fix to ETHREEOH-441 - Probably merged without merge info 16822: Merged DEV/BELARUS/V3.1 to V3.1 16753: ETHREEOH-1951: when versionable aspect is active, using the Microsoft Word option ... 16825: Fixed ETHREEOH-803: Incorrect mimetype is displayed for .pps and .pot files 16862: Fix for ETHREEOH-801 Fail to extract some kind of PDF file metadata - Resolved merge by prefering merged-in fix 16880: Merged V2.2 to V3.1 13966: (record only) Updated to use ALF-BINARIES version of installjammer 14340: (record only) Fix so deployment installers build 14719: (record only) Tweaks to AMP 15153: (record only) Fix ETWOTWO-1264 - PHP integration 15287: Fixed ETWOTWO-989: MS Sql server upgrade from 2.1.6 failed 15351: ETWOTWO-1345 (script not matching patch ID) 16928: Fixed shutdown: Task threads are now daemon threads 16986: Merged V3.1 to V3.2 16932: Moved Lucene ResultSet prefetch code to use NodeBulkLoader (backed by common code in Node DAO) 16945: (RECORD ONLY) Merged V3.2 to V3.1 16931: Fixed build unit test path for recent Chiba lib change 16957: Removed Hibernate event listener after 3.2.3 CGLib fixes 16959: Applied fix for ETHREEOH-2121: ContentUtils.getContentAsString does not pass JSESSIONID into the request 16961: 
(RECORD ONLY) Updated db settings in configs 16964: (RECORD ONLY) Merged V3.2 to V3.1 16308: ETHREEOH-2833: The Content rule with 'Items with specific text value in property' condition can't be created. 16968: Fixed ETHREEOH-2120: Recently Modified Documents Dashlet failed to load after a big upload 16983: Build fix: avoid queries for parent assocs if no nodes were found during child node in caching 16991: Merge V3.1 to V3.2 15136: (RECORD ONLY) : changes have already been merged. MERGE 2.2 to 3.1 14985 - ETWOTWO-1174 - Preview of protected PDFs results in an error 14305 - ETWOTWO-951 - contribution 14601 - ETWOTWO-1236 - Make FSR deployment case sensitive. 17022: MT - fix fallout from r16924 - add inbound collection support to MT node service interceptor (caught by MultiTDemoTest.testCreateGroups) 17023: Fixed parentAssocCache bug when adding assocs against an empty cache git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@17025 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
144 lines
5.8 KiB
Java
144 lines
5.8 KiB
Java
/*
|
|
* Copyright (C) 2005 Jesper Steen Møller
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version 2
|
|
* of the License, or (at your option) any later version.
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
* As a special exception to the terms and conditions of version 2.0 of
|
|
* the GPL, you may redistribute this Program in connection with Free/Libre
|
|
* and Open Source Software ("FLOSS") applications as described in Alfresco's
|
|
* FLOSS exception. You should have received a copy of the text describing
|
|
* the FLOSS exception, and it is also available here:
|
|
* http://www.alfresco.com/legal/licensing
|
|
*/
|
|
package org.alfresco.repo.content.metadata;
|
|
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.Serializable;
|
|
import java.text.SimpleDateFormat;
|
|
import java.util.Arrays;
|
|
import java.util.Calendar;
|
|
import java.util.Date;
|
|
import java.util.HashSet;
|
|
import java.util.Map;
|
|
|
|
import org.alfresco.repo.content.MimetypeMap;
|
|
import org.alfresco.service.cmr.repository.ContentReader;
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.LogFactory;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
|
|
|
/**
|
|
* Metadata extractor for the PDF documents.
|
|
* <pre>
|
|
* <b>author:</b> -- cm:author
|
|
* <b>title:</b> -- cm:title
|
|
* <b>subject:</b> -- cm:description
|
|
* <b>created:</b> -- cm:created
|
|
* </pre>
|
|
*
|
|
* @author Jesper Steen Møller
|
|
* @author Derek Hulley
|
|
*/
|
|
public class PdfBoxMetadataExtracter extends AbstractMappingMetadataExtracter
|
|
{
|
|
protected static Log pdfLogger = LogFactory.getLog(PdfBoxMetadataExtracter.class);
|
|
|
|
private static final String KEY_AUTHOR = "author";
|
|
private static final String KEY_TITLE = "title";
|
|
private static final String KEY_SUBJECT = "subject";
|
|
private static final String KEY_CREATED = "created";
|
|
|
|
public static String[] SUPPORTED_MIMETYPES = new String[] {MimetypeMap.MIMETYPE_PDF };
|
|
|
|
public PdfBoxMetadataExtracter()
|
|
{
|
|
super(new HashSet<String>(Arrays.asList(SUPPORTED_MIMETYPES)));
|
|
}
|
|
|
|
@Override
|
|
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
|
|
{
|
|
Map<String, Serializable> rawProperties = newRawMap();
|
|
|
|
PDDocument pdf = null;
|
|
InputStream is = null;
|
|
try
|
|
{
|
|
is = reader.getContentInputStream();
|
|
// stream the document in
|
|
pdf = PDDocument.load(is);
|
|
if (!pdf.isEncrypted())
|
|
{
|
|
// Scoop out the metadata
|
|
PDDocumentInformation docInfo = pdf.getDocumentInformation();
|
|
|
|
putRawValue(KEY_AUTHOR, docInfo.getAuthor(), rawProperties);
|
|
putRawValue(KEY_TITLE, docInfo.getTitle(), rawProperties);
|
|
putRawValue(KEY_SUBJECT, docInfo.getSubject(), rawProperties);
|
|
|
|
try
|
|
{
|
|
Calendar created = docInfo.getCreationDate();
|
|
if (created != null)
|
|
{
|
|
putRawValue(KEY_CREATED, created.getTime(), rawProperties);
|
|
}
|
|
}
|
|
catch (IOException iox)
|
|
{
|
|
// This sometimes fails because the date is a string: ETHREEOH-1936
|
|
// Alfresco bug ETHREEOH-801 refers to a bug in PDFBox (http://issues.apache.org/jira/browse/PDFBOX-145)
|
|
// where the above call to docInfo.getCreationDate() throws an IOException for some PDFs.
|
|
//
|
|
// The code below is a workaround for that issue.
|
|
|
|
// This creationDate has format: D:20080429+01'00'
|
|
String creationDate = docInfo.getCustomMetadataValue("CreationDate");
|
|
|
|
if (pdfLogger.isWarnEnabled())
|
|
{
|
|
pdfLogger.warn("IOException caught when extracting metadata from pdf file.");
|
|
pdfLogger.warn("This may be caused by a PDFBox bug that can often be worked around. The stack trace below is provided for information purposes only.");
|
|
pdfLogger.warn("", iox);
|
|
}
|
|
|
|
final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
|
|
if (creationDate != null && creationDate.length() > 10) // 10 allows for "D:yyyyMMdd"
|
|
{
|
|
String dateWithoutLeadingDColon = creationDate.substring(2);
|
|
Date parsedDate = sdf.parse(dateWithoutLeadingDColon);
|
|
putRawValue(KEY_CREATED, parsedDate, rawProperties);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
finally
|
|
{
|
|
if (is != null)
|
|
{
|
|
try { is.close(); } catch (IOException e) {}
|
|
}
|
|
if (pdf != null)
|
|
{
|
|
try { pdf.close(); } catch (Throwable e) { e.printStackTrace(); }
|
|
}
|
|
}
|
|
// Done
|
|
return rawProperties;
|
|
}
|
|
}
|