Extended Path parsing ...

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@2111 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
Author: Andrew Hind
Date:   2006-01-13 16:28:17 +00:00
Parent: 985138446c
Commit: cd72520330
2 changed files with 194 additions and 36 deletions
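The change teaches PathTokenFilter to accept prefix-form path elements (uri1:one) alongside the existing {uri1}one form, emitting the new PATH_ELEMENT_NAMESPACE_PREFIX token type, to return a ';'-terminated final element as a LASTQNAME token, and to skip empty elements (//, /;, ;;). The sketch below is illustrative only and is not part of this commit; it reuses the constructor arguments and constants referenced by the new PathTokenFilterTest further down, and simply prints the token type and text produced for a path that mixes both forms.

    package org.alfresco.repo.search.impl.lucene.analysis;

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Illustrative driver (not part of this commit): tokenise a path that mixes
    // the existing {uri} form with the new prefix form and print each token.
    public class PathTokenFilterExample
    {
        public static void main(String[] args) throws IOException
        {
            TokenStream ts = new PathTokenFilter(new StringReader("/{uri1}one/uri2:two/"),
                    PathTokenFilter.PATH_SEPARATOR, PathTokenFilter.SEPARATOR_TOKEN_TEXT,
                    PathTokenFilter.NO_NS_TOKEN_TEXT, PathTokenFilter.NAMESPACE_START_DELIMITER,
                    PathTokenFilter.NAMESPACE_END_DELIMITER, true);
            Token t;
            while ((t = ts.next()) != null)
            {
                // The unit test checks the namespace/prefix tokens ("uri1", "uri2") and the
                // name tokens ("one", "two"); the filter also emits separator and other
                // structural tokens, which are printed here as well.
                System.out.println(t.type() + " -> " + t.termText());
            }
        }
    }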

PathTokenFilter.java (modified)

@@ -29,8 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
 /**
  * @author andyh
  * 
- * TODO To change the template for this generated type comment go to Window -
- * Preferences - Java - Code Style - Code Templates
+ * TODO To change the template for this generated type comment go to Window - Preferences - Java - Code Style - Code Templates
  */
 public class PathTokenFilter extends Tokenizer
 {
@@ -53,6 +52,8 @@ public class PathTokenFilter extends Tokenizer
     public final static String TOKEN_TYPE_PATH_ELEMENT_NAME = "PATH_ELEMENT_NAME";
     public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE = "PATH_ELEMENT_NAMESPACE";
+    public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX = "PATH_ELEMENT_NAMESPACE_PREFIX";
     char pathSeparator;
@@ -68,6 +69,8 @@ public class PathTokenFilter extends Tokenizer
     int nsEndDelimiterLength;
+    char nsPrefixDelimiter = ':';
     LinkedList<Token> tokens = new LinkedList<Token>();
     Iterator<Token> it = null;
@@ -129,13 +132,14 @@ public class PathTokenFilter extends Tokenizer
         while ((t = nextToken()) != null)
         {
             String text = t.termText();
-            if((text.length() == 0) || text.equals(pathSeparator))
+            if (text.length() == 0)
             {
-                break;
+                continue; // Skip if we find // or /; or ;; etc
             }
-            if (text.charAt(text.length()-1) == pathSeparator)
+            if (text.charAt(text.length() - 1) == pathSeparator)
             {
                 text = text.substring(0, text.length() - 1);
                 pathSplitToken = new Token(separatorTokenText, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
@@ -144,11 +148,19 @@ public class PathTokenFilter extends Tokenizer
             }
             int split = -1;
+            boolean isPrefix = false;
             if ((text.length() > 0) && (text.charAt(0) == nsStartDelimiter))
             {
                 split = text.indexOf(nsEndDelimiter);
             }
+            if (split == -1)
+            {
+                split = text.indexOf(nsPrefixDelimiter);
+                isPrefix = true;
+            }
             if (split == -1)
             {
                 namespaceToken = new Token(noNsTokenText, t.startOffset(), t.startOffset(),
@@ -158,10 +170,21 @@ public class PathTokenFilter extends Tokenizer
             }
             else
             {
-                namespaceToken = new Token(text.substring(nsStartDelimiterLength, (split + nsEndDelimiterLength - 1)),
-                        t.startOffset(), t.startOffset() + split, TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
-                nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset() + split
-                        + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+                if (isPrefix)
+                {
+                    namespaceToken = new Token(text.substring(0, split), t.startOffset(), t.startOffset() + split,
+                            TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX);
+                    nameToken = new Token(text.substring(split + 1), t.startOffset()
+                            + split + 1, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+                }
+                else
+                {
+                    namespaceToken = new Token(text.substring(nsStartDelimiterLength,
+                            (split + nsEndDelimiterLength - 1)), t.startOffset(), t.startOffset() + split,
+                            TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
+                    nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset()
+                            + split + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
+                }
             }
             namespaceToken.setPositionIncrement(1);
@@ -190,7 +213,6 @@ public class PathTokenFilter extends Tokenizer
                 pathSplitToken = null;
             }
         }
         String countString = nf.format(lengthCounter);
@@ -208,12 +230,12 @@ public class PathTokenFilter extends Tokenizer
         it = tokens.iterator();
     }
     int readerPosition = 0;
     private Token nextToken() throws IOException
     {
-        if(readerPosition == -1)
+        if (readerPosition == -1)
         {
             return null;
         }
@@ -222,34 +244,39 @@ public class PathTokenFilter extends Tokenizer
         int start = readerPosition;
         int current;
         char c;
-        while((current = input.read()) != -1)
+        while ((current = input.read()) != -1)
         {
-            c = (char)current;
+            c = (char) current;
             readerPosition++;
-            if(c == nsStartDelimiter)
+            if (c == nsStartDelimiter)
            {
                 inNameSpace = true;
             }
-            else if(c == nsEndDelimiter)
+            else if (c == nsEndDelimiter)
             {
                 inNameSpace = false;
             }
-            else if(!inNameSpace && (c == '/'))
+            else if (!inNameSpace && (c == '/'))
             {
-                return new Token(buffer.toString(), start, readerPosition-1, "QNAME");
+                return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
             }
+            else if (!inNameSpace && (c == ';'))
+            {
+                buffer.append(c);
+                return new Token(buffer.toString(), start, readerPosition , "LASTQNAME");
+            }
             buffer.append(c);
         }
         readerPosition = -1;
-        if(!inNameSpace)
+        if (!inNameSpace)
         {
-            return new Token(buffer.toString(), start, readerPosition-1, "QNAME");
+            return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
         }
         else
         {
-            throw new IllegalStateException("QName terminated incorrectly: "+buffer.toString());
+            throw new IllegalStateException("QName terminated incorrectly: " + buffer.toString());
         }
     }
 }

PathTokenFilterTest.java (new file)

@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2005 Alfresco, Inc.
+ *
+ * Licensed under the Mozilla Public License version 1.1
+ * with a permitted attribution clause. You may obtain a
+ * copy of the License at
+ *
+ * http://www.alfresco.org/legal/license.txt
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific
+ * language governing permissions and limitations under the
+ * License.
+ */
+package org.alfresco.repo.search.impl.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import junit.framework.TestCase;
+
+public class PathTokenFilterTest extends TestCase
+{
+    public PathTokenFilterTest()
+    {
+        super();
+    }
+
+    public PathTokenFilterTest(String arg0)
+    {
+        super(arg0);
+    }
+
+    public void testFullPath() throws IOException
+    {
+        tokenise("{uri1}one", new String[]{"uri1", "one"});
+        tokenise("/{uri1}one", new String[]{"uri1", "one"});
+        tokenise("{uri1}one/{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("/{uri1}one/{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("{uri1}one/{uri2}two/{uri3}three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        tokenise("/{uri1}one/{uri2}two/{uri3}three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        try
+        {
+            tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        }
+        catch(IllegalStateException ise)
+        {
+        }
+    }
+
+    public void testPrefixPath() throws IOException
+    {
+        tokenise("uri1:one", new String[]{"uri1", "one"});
+        tokenise("/uri1:one", new String[]{"uri1", "one"});
+        tokenise("uri1:one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("/uri1:one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("uri1:one/uri2:two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        tokenise("/uri1:one/uri2:two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        try
+        {
+            tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        }
+        catch(IllegalStateException ise)
+        {
+        }
+    }
+
+    public void testMixedPath() throws IOException
+    {
+        tokenise("{uri1}one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("/{uri1}one/uri2:two/", new String[]{"uri1", "one", "uri2", "two"});
+        tokenise("uri1:one/{uri2}two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        tokenise("/uri1:one/{uri2}two/uri3:three", new String[]{"uri1", "one", "uri2", "two", "uri3", "three"});
+        try
+        {
+            tokenise("{uri1}one;{uri2}two/", new String[]{"uri1", "one", "uri2", "two"});
+        }
+        catch(IllegalStateException ise)
+        {
+        }
+    }
+
+    private void tokenise(String path, String[] tokens) throws IOException
+    {
+        StringReader reader = new StringReader(path);
+        TokenStream ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
+                PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
+                PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
+        Token t;
+        int i = 0;
+        while( (t = ts.next()) != null)
+        {
+            if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE))
+            {
+                assert(i % 2 == 0);
+                assertEquals(t.termText(), tokens[i++]);
+            }
+            else if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX))
+            {
+                assert(i % 2 == 0);
+                assertEquals(t.termText(), tokens[i++]);
+            }
+            else if(t.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME))
+            {
+                assert(i % 2 == 1);
+                assertEquals(t.termText(), tokens[i++]);
+            }
+        }
+        if(i != tokens.length)
+        {
+            fail("Invalid number of tokens, found "+i+" and expected "+tokens.length);
+        }
+    }
+}