Extended Path parsing ...

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@2111 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Author: Andrew Hind
Date:   2006-01-13 16:28:17 +00:00
Parent: 985138446c
Commit: cd72520330
2 changed files with 194 additions and 36 deletions
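The change teaches PathTokenFilter to accept prefix-form path elements (uri1:one) alongside the existing {uri1}one form, emitting the new PATH_ELEMENT_NAMESPACE_PREFIX token type, to return a ';'-terminated final element as a LASTQNAME token, and to skip empty elements (//, /;, ;;). The sketch below is illustrative only and is not part of this commit; it reuses the constructor arguments and constants referenced by the new PathTokenFilterTest further down, and simply prints the token type and text produced for a path that mixes both forms.

    package org.alfresco.repo.search.impl.lucene.analysis;

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Illustrative driver (not part of this commit): tokenise a path that mixes
    // the existing {uri} form with the new prefix form and print each token.
    public class PathTokenFilterExample
    {
        public static void main(String[] args) throws IOException
        {
            TokenStream ts = new PathTokenFilter(new StringReader("/{uri1}one/uri2:two/"),
                    PathTokenFilter.PATH_SEPARATOR, PathTokenFilter.SEPARATOR_TOKEN_TEXT,
                    PathTokenFilter.NO_NS_TOKEN_TEXT, PathTokenFilter.NAMESPACE_START_DELIMITER,
                    PathTokenFilter.NAMESPACE_END_DELIMITER, true);
            Token t;
            while ((t = ts.next()) != null)
            {
                // The unit test checks the namespace/prefix tokens ("uri1", "uri2") and the
                // name tokens ("one", "two"); the filter also emits separator and other
                // structural tokens, which are printed here as well.
                System.out.println(t.type() + " -> " + t.termText());
            }
        }
    }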

PathTokenFilter.java (modified)

@@ -29,8 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
 /**
  * @author andyh
  * 
- * TODO To change the template for this generated type comment go to Window -
- * Preferences - Java - Code Style - Code Templates
+ * TODO To change the template for this generated type comment go to Window - Preferences - Java - Code Style - Code Templates
  */
 public class PathTokenFilter extends Tokenizer
 {
@@ -53,6 +52,8 @@ public class PathTokenFilter extends Tokenizer
     public final static String TOKEN_TYPE_PATH_ELEMENT_NAME = "PATH_ELEMENT_NAME";
     public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE = "PATH_ELEMENT_NAMESPACE";
+    public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX = "PATH_ELEMENT_NAMESPACE_PREFIX";
     char pathSeparator;
@@ -68,6 +69,8 @@ public class PathTokenFilter extends Tokenizer
     int nsEndDelimiterLength;
+    char nsPrefixDelimiter = ':';
     LinkedList<Token> tokens = new LinkedList<Token>();
     Iterator<Token> it = null;
@@ -129,13 +132,14 @@ public class PathTokenFilter extends Tokenizer
         while ((t = nextToken()) != null)
         {
             String text = t.termText();
-            if((text.length() == 0) || text.equals(pathSeparator))
+            if (text.length() == 0)
             {
-                break;
+                continue; // Skip if we find // or /; or ;; etc
             }
-            if (text.charAt(text.length()-1) == pathSeparator)
+            if (text.charAt(text.length() - 1) == pathSeparator)
             {
                 text = text.substring(0, text.length() - 1);
                 pathSplitToken = new Token(separatorTokenText, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
@@ -144,11 +148,19 @@ public class PathTokenFilter extends Tokenizer
             }
             int split = -1;
+            boolean isPrefix = false;
             if ((text.length() > 0) && (text.charAt(0) == nsStartDelimiter))
             {
                 split = text.indexOf(nsEndDelimiter);
             }
+            if (split == -1)
+            {
+                split = text.indexOf(nsPrefixDelimiter);
+                isPrefix = true;
+            }
             if (split == -1)
             {
                 namespaceToken = new Token(noNsTokenText, t.startOffset(), t.startOffset(),
@@ -158,10 +170,21 @@ public class PathTokenFilter extends Tokenizer
             }
             else
             {
-                namespaceToken = new Token(text.substring(nsStartDelimiterLength, (split + nsEndDelimiterLength - 1)),
-                        t.startOffset(), t.startOffset() + split, TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
-                nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset() + split
-                        + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+                if (isPrefix)
+                {
+                    namespaceToken = new Token(text.substring(0, split), t.startOffset(), t.startOffset() + split,
+                            TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX);
+                    nameToken = new Token(text.substring(split + 1), t.startOffset()
+                            + split + 1, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+                }
+                else
+                {
+                    namespaceToken = new Token(text.substring(nsStartDelimiterLength,
+                            (split + nsEndDelimiterLength - 1)), t.startOffset(), t.startOffset() + split,
+                            TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
+                    nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset()
+                            + split + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+                }
             }
             namespaceToken.setPositionIncrement(1);
@@ -190,7 +213,6 @@ public class PathTokenFilter extends Tokenizer
                 pathSplitToken = null;
             }
         }
         String countString = nf.format(lengthCounter);
@@ -208,12 +230,12 @@ public class PathTokenFilter extends Tokenizer
         it = tokens.iterator();
     }
     int readerPosition = 0;
     private Token nextToken() throws IOException
     {
-        if(readerPosition == -1)
+        if (readerPosition == -1)
         {
             return null;
         }
@@ -222,34 +244,39 @@ public class PathTokenFilter extends Tokenizer
         int start = readerPosition;
         int current;
         char c;
-        while((current = input.read()) != -1)
+        while ((current = input.read()) != -1)
         {
-            c = (char)current;
+            c = (char) current;
             readerPosition++;
-            if(c == nsStartDelimiter)
+            if (c == nsStartDelimiter)
            {
                 inNameSpace = true;
             }
-            else if(c == nsEndDelimiter)
+            else if (c == nsEndDelimiter)
             {
                 inNameSpace = false;
             }
-            else if(!inNameSpace && (c == '/'))
+            else if (!inNameSpace && (c == '/'))
             {
-                return new Token(buffer.toString(), start, readerPosition-1, "QNAME");
+                return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
             }
+            else if (!inNameSpace && (c == ';'))
+            {
+                buffer.append(c);
+                return new Token(buffer.toString(), start, readerPosition , "LASTQNAME");
+            }
             buffer.append(c);
         }
         readerPosition = -1;
-        if(!inNameSpace)
+        if (!inNameSpace)
         {
-            return new Token(buffer.toString(), start, readerPosition-1, "QNAME");
+            return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
         }
         else
         {
-            throw new IllegalStateException("QName terminated incorrectly: "+buffer.toString());
+            throw new IllegalStateException("QName terminated incorrectly: " + buffer.toString());
         }
     }
 }

PathTokenFilterTest.java (new file)

@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2005 Alfresco, Inc.
+ *
+ * Licensed under the Mozilla Public License version 1.1
+ * with a permitted attribution clause. You may obtain a
+ * copy of the License at
+ *
+ * http://www.alfresco.org/legal/license.txt
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific
+ * language governing permissions and limitations under the
+ * License.
+ */
+package org.alfresco.repo.search.impl.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import junit.framework.TestCase;
+
+public class PathTokenFilterTest extends TestCase
+{
+    public PathTokenFilterTest()
+    {
+        super();
+    }
+
+    public PathTokenFilterTest(String arg0)
+    {
+        super(arg0);
+    }
+
+    public void testFullPath() throws IOException
+    {
+        tokenise("{uri1}one", new String[]{"uri1", "one"});
+        tokenise("/{uri1}one", new String[]{"uri1", "one"});
+        tokenise("{uri1}one/{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("/{uri1}one/{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("{uri1}one/{uri2}two/{uri3}three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        tokenise("/{uri1}one/{uri2}two/{uri3}three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        try
+        {
+            tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        }
+        catch(IllegalStateException ise)
+        {
+        }
+    }
+
+    public void testPrefixPath() throws IOException
+    {
+        tokenise("uri1:one", new String[]{"uri1", "one"});
+        tokenise("/uri1:one", new String[]{"uri1", "one"});
+        tokenise("uri1:one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("/uri1:one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("uri1:one/uri2:two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        tokenise("/uri1:one/uri2:two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        try
+        {
+            tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        }
+        catch(IllegalStateException ise)
+        {
+        }
+    }
+
+    public void testMixedPath() throws IOException
+    {
+        tokenise("{uri1}one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("/{uri1}one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("uri1:one/{uri2}two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        tokenise("/uri1:one/{uri2}two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        try
+        {
+            tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        }
+        catch(IllegalStateException ise)
+        {
+        }
+    }
+
+    private void tokenise(String path, String[] tokens) throws IOException
+    {
+        StringReader reader = new StringReader(path);
+        TokenStream ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
+                PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
+                PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
+        Token t;
+        int i = 0;
+        while( (t = ts.next()) != null)
+        {
+            if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE))
+            {
+                assert(i % 2 == 0);
+                assertEquals(t.termText(), tokens[i++]);
+            }
+            else if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX))
+            {
+                assert(i % 2 == 0);
+                assertEquals(t.termText(), tokens[i++]);
+            }
+            else if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME))
+            {
+                assert(i % 2 == 1);
+                assertEquals(t.termText(), tokens[i++]);
+            }
+        }
+        if(i != tokens.length)
+        {
+            fail("Invalid number of tokens, found "+i+" and expected "+tokens.length);
+        }
+    }
+}