diff --git a/config/alfresco/linkvalidation-service-context.xml b/config/alfresco/linkvalidation-service-context.xml index 72018b6a13..2206b40b2f 100644 --- a/config/alfresco/linkvalidation-service-context.xml +++ b/config/alfresco/linkvalidation-service-context.xml @@ -118,6 +118,103 @@ + + + + + + aaa + aaas + acap + afs + cap + cid + crid + data + dav + dict + dns + dtn + fax + file + ftp + go + gopher + h323 + + + iax2 + icap + im + imap + info + ipp + iris + iris.beep + iris.lwz + iris.xpc + iris.xpcs + ldap + mailserver + mailto + mid + modem + msrp + msrps + mtqp + mupdate + news + nfs + nntp + opaquelocktoken + pop + pres + prospero + rtsp + service + shttp + sip + sips + snmp + soap.beep + soap.beeps + tag + tel + telnet + tftp + thismessage + tip + tn3270 + tv + urn + vemmi + wais + xmlrpc.beep + xmlrpc.beeps + xmpp + z39.50r + z39.50s + + + + + + + @@ -133,12 +230,15 @@ - + + + + diff --git a/source/java/org/alfresco/repo/avm/util/UriSchemeNameMatcher.java b/source/java/org/alfresco/repo/avm/util/UriSchemeNameMatcher.java new file mode 100644 index 0000000000..546a5c6cd4 --- /dev/null +++ b/source/java/org/alfresco/repo/avm/util/UriSchemeNameMatcher.java @@ -0,0 +1,205 @@ +/*----------------------------------------------------------------------------- +* Copyright 2007 Alfresco Inc. +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* for more details. 
+* +* You should have received a copy of the GNU General Public License along +* with this program; if not, write to the Free Software Foundation, Inc., +* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. As a special +* exception to the terms and conditions of version 2.0 of the GPL, you may +* redistribute this Program in connection with Free/Libre and Open Source +* Software ("FLOSS") applications as described in Alfresco's FLOSS exception. +* You should have received a copy of the text describing the FLOSS exception, +* and it is also available here: http://www.alfresco.com/legal/licensing +* +* +* Author Jon Cox +* File UriSchemeNameMatcher.java +*----------------------------------------------------------------------------*/ + +package org.alfresco.repo.avm.util; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.List; + +import org.alfresco.util.NameMatcher; + +/** + * A NameMatcher that matches an incoming URL against list of schemes + * (less formally known as "protocols"), case insensitively. + * The formal spec for parsing URIs is RFC-3986 + *

+ * Perhaps someday, it might be worthwhile to create a specific
+ * parser for each registered scheme-specific part, and validate
+ * that; for now, we'll just be more lax, and assume the URI
+ * is always scheme-qualified.  This matcher will look no further
+ * than the leading colon, and declare "no match" otherwise.
+ * The discussion below explains why.
+ *

+ * See http://tools.ietf.org/html/rfc3986:
+ *

+ *  The following regex parses URIs:
+ *       ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ *
+ *  Given the following URI:   
+ *        http://www.ics.uci.edu/pub/ietf/uri/#Related
+ *
+ *  The captured subexpressions are:
+ *
+ *        $1 = http:
+ *        $2 = http
+ *        $3 = //www.ics.uci.edu
+ *        $4 = www.ics.uci.edu
+ *        $5 = /pub/ietf/uri/
+ *        $6 = 
+ *        $7 = 
+ *        $8 = #Related
+ *        $9 = Related   
+ *
+ *   NOTE:
+ *      A URI can be non-scheme qualified because $1 is optional.  Therefore,
+ *      the following are all examples of valid non-scheme qualified URIs:
+ *
+ *         ""
+ *         "moo@cow.com"
+ *         "moo@cow.com?wow"
+ *         "moo@cow.com?wow#zow"
+ *         "moo@cow.com#zow"
+ *         "/"
+ *         "/moo/cow"
+ *         "/moo/cow?wow"
+ *         "/moo/cow?wow#zow"
+ *         "/moo/cow#zow"
+ *         "//moo/cow"
+ *         "//moo.com/cow"
+ *         "//moo.com/cow/"
+ *         "//moo.com/cow?wow"
+ *         "//moo.com/cow?wow#zow"
+ *         "//moo.com/cow#zow"
+ *         "//moo.com:8080/cow"
+ *         "//moo.com:/cow"
+ *         "//moo.com:8080/cow?wow"
+ *         "//moo.com:8080/cow?wow#zow"
+ *         "//moo.com:8080/cow#zow"
+ *         "///moo/cow"
+ *         "///moo/cow?wow"
+ *         "///moo/cow?wow#zow"
+ *         "///moo/cow#zow"
+ *
+ *      And so forth...
+ *      
+ * 
+ *
+ *  Thus the business end of things as far as scheme matching is: $2.
+ *  Most schemes will have a $3 that starts with '//', but not all.
+ *  Specifically, the following have no "network path" ('//') segment,
+ *  or aren't required to (source: http://en.wikipedia.org/wiki/URI_scheme):
+ *  
+ *
+ *      cid data dns fax go h323 iax2 mailto mid news pres sip
+ *      sips tel urn xmpp about aim callto feed magnet msnim 
+ *      psyc skype sms stream xfire ymsgr
+ *
+ *  
+ * + * Visually the parts are as follows: + *
+ * 
+ *         foo://example.com:10042/over/there?name=ferret#nose
+ *         \_/   \_______________/\_________/ \_________/ \__/
+ *          |           |            |            |        |
+ *       scheme     authority       path        query   fragment
+ *          |   _____________________|__
+ *         / \ /                        \
+ *         urn:example:animal:ferret:nose
+ *
+ * 
+ *
+ * This is useful for classifying URLs for things like whether or not
+ * they're supported by an application.
+ *
+ * For example, the LinkValidationService supports http and https,
+ * is willing to ignore certain well-formed URLs, but treats URLs
+ * with unknown and unsupported protocols as broken.  Concretely,
+ * we'd like to avoid treating something like the following one
+ * as being non-broken even though you can't apply GET or HEAD
+ * to it.
+ *
+ *
+ * Email
+ * 
+ *
+ * As of June 2007, IANA had over 70 registered and provisional protocols
+ * listed at http://www.iana.org/assignments/uri-schemes.html but sometimes
+ * people create their own too (e.g.: cvs).  Here's the official list:
+ *
+ *
+ *    aaa aaas acap afs cap cid crid data dav dict dns dtn fax file
+ *    ftp go gopher h323 http https iax2 icap im imap info ipp iris
+ *    iris.beep iris.lwz iris.xpc iris.xpcs ldap mailserver mailto
+ *    mid modem msrp msrps mtqp mupdate news nfs nntp opaquelocktoken
+ *    pop pres prospero rtsp service shttp sip sips snmp soap.beep
+ *    soap.beeps tag tel telnet tftp thismessage tip tn3270 tv urn
+ *    vemmi wais xmlrpc.beep xmlrpc.beeps xmpp z39.50r z39.50s
+ * 
+ * + */ +public class UriSchemeNameMatcher implements NameMatcher, Serializable +{ + /** + * The extensions to match. + */ + HashMap scheme_; + + /** + * Default constructor. + */ + public UriSchemeNameMatcher() + { + scheme_ = new HashMap(); + } + + /** + * Set the protocols case insensitively (cannonicalized to lower-case). + * + * @param protocols + */ + public void setExtensions(List protocols) + { + for (String protocol : protocols) + { + scheme_.put( protocol.toLowerCase(), null ); + } + } + + /** + * Returns true if the URL's protocol is in the of + * being matched. Everything up to but not including + * the intial colon is + */ + public boolean matches(String uri) + { + if ( uri == null ) { return false; } + + int colon_index = uri.indexOf(':'); + + if ( colon_index >= 0) + { + String proto = + uri.substring(0, colon_index).toLowerCase(); + + return scheme_.containsKey( proto ); + } + return false; + } +} +