diff --git a/config/alfresco/content-services-context.xml b/config/alfresco/content-services-context.xml index 59b45df533..22a27b550f 100644 --- a/config/alfresco/content-services-context.xml +++ b/config/alfresco/content-services-context.xml @@ -94,9 +94,12 @@ - + - + + + + diff --git a/config/alfresco/core-services-context.xml b/config/alfresco/core-services-context.xml index 5bcbdd568d..b8aa3cb7d2 100644 --- a/config/alfresco/core-services-context.xml +++ b/config/alfresco/core-services-context.xml @@ -143,6 +143,21 @@ false + + + + + UTF-8 + + + + + + + + + + diff --git a/config/alfresco/mimetype/mimetype-map.xml b/config/alfresco/mimetype/mimetype-map.xml index 9e4926bac2..7b43ada79f 100644 --- a/config/alfresco/mimetype/mimetype-map.xml +++ b/config/alfresco/mimetype/mimetype-map.xml @@ -2,7 +2,7 @@ - + txt csv java @@ -15,14 +15,14 @@ sh log - + html htm shtml body - + xhtml @@ -77,7 +77,7 @@ cgm - + class @@ -86,7 +86,7 @@ csh - + css @@ -95,7 +95,7 @@ wpd - + xml dtd xslt @@ -104,7 +104,7 @@ dvi - + etx @@ -125,7 +125,7 @@ hqx - + ics @@ -142,10 +142,10 @@ svg - + js - + latex @@ -229,10 +229,10 @@ sgml sgm - + sh - + shar @@ -264,7 +264,7 @@ tiff tif - + tsv diff --git a/config/alfresco/model/contentModel.xml b/config/alfresco/model/contentModel.xml index c4b50f7dae..91b7857084 100644 --- a/config/alfresco/model/contentModel.xml +++ b/config/alfresco/model/contentModel.xml @@ -16,7 +16,7 @@ - \<\?\/\:\|\xA3\xAC\%\&\+\;]+.*)|(.*[\.]?.*[\.]+$)|(.*[ ]+$)]]> + \<\?\/\:\|]+.*)|(.*[\.]?.*[\.]+$)|(.*[ ]+$)]]> false diff --git a/config/alfresco/network-protocol-context.xml b/config/alfresco/network-protocol-context.xml index 862e3d3789..3d44d836f7 100644 --- a/config/alfresco/network-protocol-context.xml +++ b/config/alfresco/network-protocol-context.xml @@ -58,6 +58,7 @@ + diff --git a/source/java/org/alfresco/filesys/smb/server/repo/ContentDiskDriver.java b/source/java/org/alfresco/filesys/smb/server/repo/ContentDiskDriver.java index 043dc18be9..bdd2c99006 
100644 --- a/source/java/org/alfresco/filesys/smb/server/repo/ContentDiskDriver.java +++ b/source/java/org/alfresco/filesys/smb/server/repo/ContentDiskDriver.java @@ -68,6 +68,7 @@ import org.alfresco.repo.security.authentication.AuthenticationComponent; import org.alfresco.service.cmr.lock.NodeLockedException; import org.alfresco.service.cmr.model.FileFolderService; import org.alfresco.service.cmr.repository.ContentService; +import org.alfresco.service.cmr.repository.MimetypeService; import org.alfresco.service.cmr.repository.NodeRef; import org.alfresco.service.cmr.repository.NodeService; import org.alfresco.service.cmr.repository.StoreRef; @@ -111,6 +112,7 @@ public class ContentDiskDriver extends AlfrescoDiskDriver implements DiskInterfa private NodeService nodeService; private SearchService searchService; private ContentService contentService; + private MimetypeService mimetypeService; private PermissionService permissionService; private FileFolderService fileFolderService; @@ -280,6 +282,14 @@ public class ContentDiskDriver extends AlfrescoDiskDriver implements DiskInterfa fileFolderService = fileService; } + /** + * @param mimetypeService service for helping with mimetypes and encoding + */ + public void setMimetypeService(MimetypeService mimetypeService) + { + this.mimetypeService = mimetypeService; + } + /** * Parse and validate the parameter string and create a device context object for this instance * of the shared device. 
The same DeviceInterface implementation may be used for multiple @@ -1240,7 +1250,7 @@ public class ContentDiskDriver extends AlfrescoDiskDriver implements DiskInterfa { // Create the network file - netFile = ContentNetworkFile.createFile(transactionService, nodeService, contentService, cifsHelper, nodeRef, params); + netFile = ContentNetworkFile.createFile(nodeService, contentService, mimetypeService, cifsHelper, nodeRef, params); } else { @@ -1406,7 +1416,7 @@ public class ContentDiskDriver extends AlfrescoDiskDriver implements DiskInterfa // Create the network file - NetworkFile netFile = ContentNetworkFile.createFile(transactionService, nodeService, contentService, cifsHelper, nodeRef, params); + NetworkFile netFile = ContentNetworkFile.createFile(nodeService, contentService, mimetypeService, cifsHelper, nodeRef, params); // Truncate the file so that the content stream is created diff --git a/source/java/org/alfresco/filesys/smb/server/repo/ContentNetworkFile.java b/source/java/org/alfresco/filesys/smb/server/repo/ContentNetworkFile.java index 535a727e56..3bdf0778fc 100644 --- a/source/java/org/alfresco/filesys/smb/server/repo/ContentNetworkFile.java +++ b/source/java/org/alfresco/filesys/smb/server/repo/ContentNetworkFile.java @@ -24,10 +24,14 @@ */ package org.alfresco.filesys.smb.server.repo; +import java.io.BufferedInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.nio.ByteBuffer; +import java.nio.channels.Channels; import java.nio.channels.FileChannel; +import java.nio.charset.Charset; import org.alfresco.error.AlfrescoRuntimeException; import org.alfresco.filesys.server.filesys.AccessDeniedException; @@ -38,15 +42,16 @@ import org.alfresco.filesys.server.filesys.NetworkFile; import org.alfresco.filesys.smb.SeekType; import org.alfresco.i18n.I18NUtil; import org.alfresco.model.ContentModel; +import org.alfresco.repo.content.encoding.ContentCharsetFinder; import 
org.alfresco.repo.content.filestore.FileContentReader; import org.alfresco.service.cmr.repository.ContentAccessor; import org.alfresco.service.cmr.repository.ContentData; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.ContentService; import org.alfresco.service.cmr.repository.ContentWriter; +import org.alfresco.service.cmr.repository.MimetypeService; import org.alfresco.service.cmr.repository.NodeRef; import org.alfresco.service.cmr.repository.NodeService; -import org.alfresco.service.transaction.TransactionService; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -62,9 +67,9 @@ public class ContentNetworkFile extends NodeRefNetworkFile { private static final Log logger = LogFactory.getLog(ContentNetworkFile.class); - private TransactionService transactionService; private NodeService nodeService; private ContentService contentService; + private MimetypeService mimetypeService; // File channel to file content @@ -86,9 +91,9 @@ public class ContentNetworkFile extends NodeRefNetworkFile * Helper method to create a {@link NetworkFile network file} given a node reference. 
*/ public static ContentNetworkFile createFile( - TransactionService transactionService, NodeService nodeService, ContentService contentService, + MimetypeService mimetypeService, CifsHelper cifsHelper, NodeRef nodeRef, FileOpenParams params) @@ -100,7 +105,7 @@ public class ContentNetworkFile extends NodeRefNetworkFile // Create the file - ContentNetworkFile netFile = new ContentNetworkFile(transactionService, nodeService, contentService, nodeRef, path); + ContentNetworkFile netFile = new ContentNetworkFile(nodeService, contentService, mimetypeService, nodeRef, path); // Set relevant parameters @@ -176,17 +181,17 @@ public class ContentNetworkFile extends NodeRefNetworkFile * @param name String */ private ContentNetworkFile( - TransactionService transactionService, NodeService nodeService, ContentService contentService, + MimetypeService mimetypeService, NodeRef nodeRef, String name) { super(name, nodeRef); setFullName(name); - this.transactionService = transactionService; this.nodeService = nodeService; this.contentService = contentService; + this.mimetypeService = mimetypeService; } /** @@ -362,6 +367,13 @@ public class ContentNetworkFile extends NodeRefNetworkFile if (modified) { + // Take a guess at the encoding (charset), using the content mimetype as a hint + channel.position(0); + InputStream is = new BufferedInputStream(Channels.newInputStream(channel)); + ContentCharsetFinder charsetFinder = mimetypeService.getContentCharsetFinder(); + Charset charset = charsetFinder.getCharset(is, content.getMimetype()); + content.setEncoding(charset.name()); + // Close the channel channel.close(); diff --git a/source/java/org/alfresco/repo/content/ContentTestSuite.java b/source/java/org/alfresco/repo/content/ContentTestSuite.java index e53e4eed39..acb22bd42d 100644 --- a/source/java/org/alfresco/repo/content/ContentTestSuite.java +++ b/source/java/org/alfresco/repo/content/ContentTestSuite.java @@ -25,6 +25,7 @@ package org.alfresco.repo.content; import org.alfresco.repo.content.cleanup.ContentStoreCleanerTest; 
+import org.alfresco.repo.content.encoding.CharsetFinderTest; import org.alfresco.repo.content.filestore.FileContentStoreTest; import org.alfresco.repo.content.filestore.NoRandomAccessFileContentStoreTest; import org.alfresco.repo.content.filestore.ReadOnlyFileContentStoreTest; @@ -63,6 +64,7 @@ public class ContentTestSuite extends TestSuite TestSuite suite = new TestSuite(); suite.addTestSuite(ContentStoreCleanerTest.class); + suite.addTestSuite(CharsetFinderTest.class); suite.addTestSuite(FileContentStoreTest.class); suite.addTestSuite(NoRandomAccessFileContentStoreTest.class); suite.addTestSuite(ReadOnlyFileContentStoreTest.class); diff --git a/source/java/org/alfresco/repo/content/MimetypeMap.java b/source/java/org/alfresco/repo/content/MimetypeMap.java index 003b005242..2bd1b820bb 100644 --- a/source/java/org/alfresco/repo/content/MimetypeMap.java +++ b/source/java/org/alfresco/repo/content/MimetypeMap.java @@ -27,15 +27,19 @@ package org.alfresco.repo.content; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import org.alfresco.config.Config; import org.alfresco.config.ConfigElement; import org.alfresco.config.ConfigLookupContext; import org.alfresco.config.ConfigService; import org.alfresco.error.AlfrescoRuntimeException; +import org.alfresco.repo.content.encoding.ContentCharsetFinder; import org.alfresco.service.cmr.repository.MimetypeService; +import org.alfresco.util.PropertyCheck; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -48,6 +52,7 @@ import org.apache.commons.logging.LogFactory; */ public class MimetypeMap implements MimetypeService { + public static final String PREFIX_TEXT = "text/"; public static final String EXTENSION_BINARY = "bin"; public static final String MIMETYPE_TEXT_PLAIN = "text/plain"; @@ -118,35 +123,76 @@ public class MimetypeMap implements MimetypeService 
private static final String ATTR_MIMETYPE = "mimetype"; private static final String ATTR_DISPLAY = "display"; private static final String ATTR_DEFAULT = "default"; + private static final String ATTR_TEXT = "text"; private static final Log logger = LogFactory.getLog(MimetypeMap.class); private ConfigService configService; + private ContentCharsetFinder contentCharsetFinder; private List mimetypes; private Map extensionsByMimetype; private Map mimetypesByExtension; private Map displaysByMimetype; private Map displaysByExtension; + private Set textMimetypes; /** - * @param configService the config service to use to read mimetypes from + * Default constructor + * + * @since 2.1 */ + public MimetypeMap() + { + } + + @Deprecated public MimetypeMap(ConfigService configService) { + logger.warn( + "MimetypeMap(ConfigService configService) has been deprecated. " + + "Use the default constructor and property 'configService'"); this.configService = configService; - } + } + /** + * @param configService the config service to use to read mimetypes from + */ + public void setConfigService(ConfigService configService) + { + this.configService = configService; + } + + /** + * {@inheritDoc} + */ + public ContentCharsetFinder getContentCharsetFinder() + { + return contentCharsetFinder; + } + + /** + * Set the system default content characterset decoder + */ + public void setContentCharsetFinder(ContentCharsetFinder contentCharsetFinder) + { + this.contentCharsetFinder = contentCharsetFinder; + } + /** * Initialises the map using the configuration service provided */ public void init() { + PropertyCheck.mandatory(this, "configService", configService); + PropertyCheck.mandatory(this, "contentCharsetFinder", contentCharsetFinder); + this.mimetypes = new ArrayList(40); this.extensionsByMimetype = new HashMap(59); this.mimetypesByExtension = new HashMap(59); this.displaysByMimetype = new HashMap(59); this.displaysByExtension = new HashMap(59); + this.textMimetypes = new HashSet(23); Config 
config = configService.getConfig(CONFIG_CONDITION, new ConfigLookupContext(CONFIG_AREA)); ConfigElement mimetypesElement = config.getConfigElement(ELEMENT_MIMETYPES); @@ -175,6 +221,14 @@ public class MimetypeMap implements MimetypeService { this.displaysByMimetype.put(mimetype, mimetypeDisplay); } + + // Check if it is a text format + String isTextStr = mimetypeElement.getAttribute(ATTR_TEXT); + boolean isText = Boolean.parseBoolean(isTextStr); + if (isText || mimetype.startsWith(PREFIX_TEXT)) + { + this.textMimetypes.add(mimetype); + } // get all the extensions boolean isFirst = true; @@ -209,6 +263,7 @@ public class MimetypeMap implements MimetypeService { this.extensionsByMimetype.put(mimetype, extension); } + // Loop again isFirst = false; } // check that there were extensions defined @@ -274,6 +329,11 @@ public class MimetypeMap implements MimetypeService return mimetypesByExtension; } + public boolean isText(String mimetype) + { + return textMimetypes.contains(mimetype); + } + /** * @see #MIMETYPE_BINARY */ diff --git a/source/java/org/alfresco/repo/content/MimetypeMapTest.java b/source/java/org/alfresco/repo/content/MimetypeMapTest.java index 6c164bcef4..bc6a11ea59 100644 --- a/source/java/org/alfresco/repo/content/MimetypeMapTest.java +++ b/source/java/org/alfresco/repo/content/MimetypeMapTest.java @@ -75,4 +75,15 @@ public class MimetypeMapTest extends TestCase // Star Office assertEquals("sds", extensionsByMimetype.get("application/vnd.stardivision.chart")); } + + public void testIsText() throws Exception + { + assertTrue(mimetypeService.isText(MimetypeMap.MIMETYPE_HTML)); + } + + public void testGetContentCharsetFinder() throws Exception + { + assertNotNull("No charset finder", mimetypeService.getContentCharsetFinder()); + } + } diff --git a/source/java/org/alfresco/repo/content/encoding/CharsetFinderTest.java b/source/java/org/alfresco/repo/content/encoding/CharsetFinderTest.java new file mode 100644 index 0000000000..4ce28c5418 --- /dev/null +++ 
b/source/java/org/alfresco/repo/content/encoding/CharsetFinderTest.java @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2005-2007 Alfresco Software Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + * As a special exception to the terms and conditions of version 2.0 of + * the GPL, you may redistribute this Program in connection with Free/Libre + * and Open Source Software ("FLOSS") applications as described in Alfresco's + * FLOSS exception. 
You should have recieved a copy of the text describing + * the FLOSS exception, and it is also available here: + * http://www.alfresco.com/legal/licensing" + */ +package org.alfresco.repo.content.encoding; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.nio.charset.Charset; + +import junit.framework.TestCase; + +import org.alfresco.encoding.CharactersetFinder; +import org.alfresco.repo.content.MimetypeMap; +import org.alfresco.repo.content.transform.AbstractContentTransformerTest; +import org.alfresco.util.ApplicationContextHelper; +import org.springframework.context.ApplicationContext; + +/** + * @see ContentCharsetFinder + * @see CharactersetFinder + * + * @author Derek Hulley + */ +public class CharsetFinderTest extends TestCase +{ + private static ApplicationContext ctx = ApplicationContextHelper.getApplicationContext(); + + private ContentCharsetFinder charsetFinder; + + @Override + public void setUp() throws Exception + { + charsetFinder = (ContentCharsetFinder) ctx.getBean("charset.finder"); + } + + public void testPlainText() throws Exception + { + File file = AbstractContentTransformerTest.loadQuickTestFile("txt"); + InputStream is = new BufferedInputStream(new FileInputStream(file)); + Charset charset = charsetFinder.getCharset(is, MimetypeMap.MIMETYPE_TEXT_PLAIN); + assertNotNull(charset); + } +} diff --git a/source/java/org/alfresco/repo/content/encoding/ContentCharsetFinder.java b/source/java/org/alfresco/repo/content/encoding/ContentCharsetFinder.java new file mode 100644 index 0000000000..2e3686c905 --- /dev/null +++ b/source/java/org/alfresco/repo/content/encoding/ContentCharsetFinder.java @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2005-2007 Alfresco Software Limited. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + * As a special exception to the terms and conditions of version 2.0 of + * the GPL, you may redistribute this Program in connection with Free/Libre + * and Open Source Software ("FLOSS") applications as described in Alfresco's + * FLOSS exception. You should have recieved a copy of the text describing + * the FLOSS exception, and it is also available here: + * http://www.alfresco.com/legal/licensing" + */ +package org.alfresco.repo.content.encoding; + +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.List; + +import org.alfresco.encoding.CharactersetFinder; +import org.alfresco.service.cmr.repository.MimetypeService; + +/** + * Utility bean to guess the charset given a stream and a mimetype. + * + * @since 2.1 + * @author Derek Hulley + */ +public class ContentCharsetFinder +{ + private Charset defaultCharset = Charset.defaultCharset(); + private MimetypeService mimetypeService; + private List charactersetFinders; + + /** + * Override the system default charset. Where the characterset cannot be determined for + * a mimetype and input stream, this characterset will be used. If never set, the JVM default charset applies. 
+ * + * @param defaultCharset the default characterset + */ + public void setDefaultCharset(String defaultCharset) + { + this.defaultCharset = Charset.forName(defaultCharset); + } + + /** + * Set the mimetype service that will help determine if a particular mimetype can be + * treated as encoded text or not. + */ + public void setMimetypeService(MimetypeService mimetypeService) + { + this.mimetypeService = mimetypeService; + } + + /** + * Set the list of characterset finders to execute, in order, for text based content. + * @param charactersetFinders a list of finders + */ + public void setCharactersetFinders(List charactersetFinders) + { + this.charactersetFinders = charactersetFinders; + } + + /** + * Gets the characterset from the stream, if the mimetype is text and the text + * has enough information to give the encoding away. Otherwise, the default + * is returned. + * + * @param is a stream that will not be affected by the call, but must + * support marking + * @param mimetype the mimetype of the stream data + * @return returns a characterset and never null + */ + public Charset getCharset(InputStream is, String mimetype) + { + // Is it text? 
+ if (!mimetypeService.isText(mimetype)) + { + return defaultCharset; + } + // Try the finders + Charset charset = null; + for (CharactersetFinder finder : charactersetFinders) + { + charset = finder.detectCharset(is); + if (charset != null) + { + break; + } + } + // Done + if (charset == null) + { + return defaultCharset; + } + else + { + return charset; + } + } +} diff --git a/source/java/org/alfresco/service/cmr/repository/MimetypeService.java b/source/java/org/alfresco/service/cmr/repository/MimetypeService.java index 969af1abb6..fd9b30cc3d 100644 --- a/source/java/org/alfresco/service/cmr/repository/MimetypeService.java +++ b/source/java/org/alfresco/service/cmr/repository/MimetypeService.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Map; import org.alfresco.error.AlfrescoRuntimeException; +import org.alfresco.repo.content.encoding.ContentCharsetFinder; import org.alfresco.service.NotAuditable; import org.alfresco.service.PublicService; @@ -82,6 +83,15 @@ public interface MimetypeService */ @NotAuditable public Map getMimetypesByExtension(); + + /** + * Check if a given mimetype represents a text format. + * + * @param mimetype the mimetype to check + * @return Returns true if it is text + */ + @NotAuditable + public boolean isText(String mimetype); /** * Get all mimetypes @@ -101,4 +111,15 @@ public interface MimetypeService */ @NotAuditable public String guessMimetype(String filename); + + /** + * Provides the system default charset finder. + * + * @return Returns a character set finder that can be used to decode + * streams in order to get the encoding. 
+ * + * @since 2.1 + */ + @NotAuditable + public ContentCharsetFinder getContentCharsetFinder(); } diff --git a/source/test-resources/quick/quick.txt b/source/test-resources/quick/quick.txt index ff3bb63948..f89201fc87 100644 --- a/source/test-resources/quick/quick.txt +++ b/source/test-resources/quick/quick.txt @@ -1 +1,7 @@ -The quick brown fox jumps over the lazy dog \ No newline at end of file +The quick brown fox jumps over the lazy dog + +Le renard brun rapide saute par-dessus le chien paresseux + +Der schnelle braune Fuchs springt über den faulen Hund + +براون وكس السريع يقفز فوق الكلب كسالي \ No newline at end of file