Extended Path parsing ...

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@2111 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Author: Andrew Hind
Date: 2006-01-13 16:28:17 +00:00
commit cd72520330
parent 985138446c
2 changed files with 194 additions and 36 deletions
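The commit teaches PathTokenFilter to accept prefix-form path elements (prefix:localName) alongside the existing curly-brace form ({namespaceURI}localName), to skip empty elements produced by repeated separators, and to emit a LASTQNAME token when the input ends in ';'. A minimal usage sketch, reusing the constructor arguments from the tokenise() helper in the new PathTokenFilterTest and placed in the same package so it can call the filter the same way; the demo class, the sample path and the namespace URI in it are illustrative only:

package org.alfresco.repo.search.impl.lucene.analysis;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class PathTokenFilterDemo
{
    public static void main(String[] args) throws IOException
    {
        // Same constructor arguments as PathTokenFilterTest.tokenise() below.
        TokenStream ts = new PathTokenFilter(new StringReader("/cm:one/{http://example.org/model}two"),
                PathTokenFilter.PATH_SEPARATOR, PathTokenFilter.SEPARATOR_TOKEN_TEXT,
                PathTokenFilter.NO_NS_TOKEN_TEXT, PathTokenFilter.NAMESPACE_START_DELIMITER,
                PathTokenFilter.NAMESPACE_END_DELIMITER, true);
        Token t;
        while ((t = ts.next()) != null)
        {
            // Prefix elements come out as PATH_ELEMENT_NAMESPACE_PREFIX + PATH_ELEMENT_NAME,
            // curly-brace elements as PATH_ELEMENT_NAMESPACE + PATH_ELEMENT_NAME.
            System.out.println(t.type() + " -> " + t.termText());
        }
    }
}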

org/alfresco/repo/search/impl/lucene/analysis/PathTokenFilter.java

@@ -29,8 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
/**
* @author andyh
*
- * TODO To change the template for this generated type comment go to Window -
- * Preferences - Java - Code Style - Code Templates
+ * TODO To change the template for this generated type comment go to Window - Preferences - Java - Code Style - Code Templates
*/
public class PathTokenFilter extends Tokenizer
{
@@ -54,6 +53,8 @@ public class PathTokenFilter extends Tokenizer
public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE = "PATH_ELEMENT_NAMESPACE";
+ public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX = "PATH_ELEMENT_NAMESPACE_PREFIX";
char pathSeparator;
String separatorTokenText;
@@ -68,6 +69,8 @@ public class PathTokenFilter extends Tokenizer
int nsEndDelimiterLength;
+ char nsPrefixDelimiter = ':';
LinkedList<Token> tokens = new LinkedList<Token>();
Iterator<Token> it = null;
@@ -130,12 +133,13 @@ public class PathTokenFilter extends Tokenizer
{
String text = t.termText();
- if((text.length() == 0) || text.equals(pathSeparator))
+ if (text.length() == 0)
{
- break;
+ continue; // Skip if we find // or /; or ;; etc
}
- if (text.charAt(text.length()-1) == pathSeparator)
+ if (text.charAt(text.length() - 1) == pathSeparator)
{
text = text.substring(0, text.length() - 1);
pathSplitToken = new Token(separatorTokenText, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
@@ -144,11 +148,19 @@ public class PathTokenFilter extends Tokenizer
}
int split = -1;
+ boolean isPrefix = false;
+ if ((text.length() > 0) && (text.charAt(0) == nsStartDelimiter))
+ {
+ split = text.indexOf(nsEndDelimiter);
+ }
+ if (split == -1)
+ {
+ split = text.indexOf(nsPrefixDelimiter);
+ isPrefix = true;
+ }
if (split == -1)
{
namespaceToken = new Token(noNsTokenText, t.startOffset(), t.startOffset(),
@@ -158,10 +170,21 @@ public class PathTokenFilter extends Tokenizer
}
else
{
- namespaceToken = new Token(text.substring(nsStartDelimiterLength, (split + nsEndDelimiterLength - 1)),
- t.startOffset(), t.startOffset() + split, TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
- nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset() + split
- + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+ if (isPrefix)
+ {
+ namespaceToken = new Token(text.substring(0, split), t.startOffset(), t.startOffset() + split,
+ TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX);
+ nameToken = new Token(text.substring(split + 1), t.startOffset()
+ + split + 1, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+ }
+ else
+ {
+ namespaceToken = new Token(text.substring(nsStartDelimiterLength,
+ (split + nsEndDelimiterLength - 1)), t.startOffset(), t.startOffset() + split,
+ TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
+ nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset()
+ + split + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+ }
}
namespaceToken.setPositionIncrement(1);
@@ -190,7 +213,6 @@ public class PathTokenFilter extends Tokenizer
pathSplitToken = null;
}
}
String countString = nf.format(lengthCounter);
@@ -213,7 +235,7 @@ public class PathTokenFilter extends Tokenizer
private Token nextToken() throws IOException
{
- if(readerPosition == -1)
+ if (readerPosition == -1)
{
return null;
}
@@ -222,34 +244,39 @@ public class PathTokenFilter extends Tokenizer
int start = readerPosition;
int current;
char c;
- while((current = input.read()) != -1)
+ while ((current = input.read()) != -1)
{
- c = (char)current;
- readerPosition++;
- if(c == nsStartDelimiter)
- {
- inNameSpace = true;
- }
- else if(c == nsEndDelimiter)
- {
- inNameSpace = false;
- }
- else if(!inNameSpace && (c == '/'))
- {
- return new Token(buffer.toString(), start, readerPosition-1, "QNAME");
- }
- buffer.append(c);
+ c = (char) current;
+ readerPosition++;
+ if (c == nsStartDelimiter)
+ {
+ inNameSpace = true;
+ }
+ else if (c == nsEndDelimiter)
+ {
+ inNameSpace = false;
+ }
+ else if (!inNameSpace && (c == '/'))
+ {
+ return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
+ }
+ else if (!inNameSpace && (c == ';'))
+ {
+ buffer.append(c);
+ return new Token(buffer.toString(), start, readerPosition , "LASTQNAME");
+ }
+ buffer.append(c);
}
readerPosition = -1;
- if(!inNameSpace)
+ if (!inNameSpace)
{
- return new Token(buffer.toString(), start, readerPosition-1, "QNAME");
+ return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
}
else
{
- throw new IllegalStateException("QName terminated incorrectly: "+buffer.toString());
+ throw new IllegalStateException("QName terminated incorrectly: " + buffer.toString());
}
}
}
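Taken together, the reworked nextToken() loop above cuts the raw reader input into QNAME tokens on any '/' that falls outside a {namespace} section, and returns a LASTQNAME token (keeping the trailing ';' in its text) when an element ends in ';'. A rough trace for two sample inputs, derived by reading the loop rather than captured from a run:

"/{uri1}one/uri2:two"  ->  QNAME ""  |  QNAME "{uri1}one"  |  QNAME "uri2:two"
"{uri1}one;"           ->  LASTQNAME "{uri1}one;"

The empty leading QNAME produced by the initial '/' is then discarded by the empty-element check added at the top of next() (the continue above).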

org/alfresco/repo/search/impl/lucene/analysis/PathTokenFilterTest.java (new file)

@@ -0,0 +1,131 @@
/*
* Copyright (C) 2005 Alfresco, Inc.
*
* Licensed under the Mozilla Public License version 1.1
* with a permitted attribution clause. You may obtain a
* copy of the License at
*
* http://www.alfresco.org/legal/license.txt
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package org.alfresco.repo.search.impl.lucene.analysis;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import junit.framework.TestCase;
public class PathTokenFilterTest extends TestCase
{
public PathTokenFilterTest()
{
super();
}
public PathTokenFilterTest(String arg0)
{
super(arg0);
}
public void testFullPath() throws IOException
{
tokenise("{uri1}one", new String[]{"uri1", "one"});
tokenise("/{uri1}one", new String[]{"uri1", "one"});
tokenise("{uri1}one/{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
tokenise("/{uri1}one/{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
tokenise("{uri1}one/{uri2}two/{uri3}three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
tokenise("/{uri1}one/{uri2}two/{uri3}three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
try
{
tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
}
catch(IllegalStateException ise)
{
}
}
public void testPrefixPath() throws IOException
{
tokenise("uri1:one", new String[]{"uri1", "one"});
tokenise("/uri1:one", new String[]{"uri1", "one"});
tokenise("uri1:one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
tokenise("/uri1:one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
tokenise("uri1:one/uri2:two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
tokenise("/uri1:one/uri2:two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
try
{
tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
}
catch(IllegalStateException ise)
{
}
}
public void testMixedPath() throws IOException
{
tokenise("{uri1}one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
tokenise("/{uri1}one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
tokenise("uri1:one/{uri2}two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
tokenise("/uri1:one/{uri2}two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
try
{
tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
}
catch(IllegalStateException ise)
{
}
}
private void tokenise(String path, String[] tokens) throws IOException
{
StringReader reader = new StringReader(path);
TokenStream ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
Token t;
int i = 0;
while( (t = ts.next()) != null)
{
if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE))
{
assert(i % 2 == 0);
assertEquals(t.termText(), tokens[i++]);
}
else if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX))
{
assert(i % 2 == 0);
assertEquals(t.termText(), tokens[i++]);
}
else if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME))
{
assert(i % 2 == 1);
assertEquals(t.termText(), tokens[i++]);
}
}
if(i != tokens.length)
{
fail("Invalid number of tokens, found "+i+" and expected "+tokens.length);
}
}
}