ALF-9413: RSOLR 022: Fine-grained control of full-text indexing

- Final part: supported in SOLR; added aspect support to Explorer and Share

git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@29192 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Andrew Hind
2011-07-19 15:02:21 +00:00
parent 663760cd08
commit ca67fe0aff
3 changed files with 145 additions and 101 deletions

View File

@@ -327,3 +327,9 @@ cm_contentmodel.property.exif_yResolution.description=Vertical resolution in pix
cm_contentmodel.property.exif_resolutionUnit.title=Resolution Unit cm_contentmodel.property.exif_resolutionUnit.title=Resolution Unit
cm_contentmodel.property.exif_resolutionUnit.description=Unit used for horizontal and vertical resolution cm_contentmodel.property.exif_resolutionUnit.description=Unit used for horizontal and vertical resolution
cm_contentmodel.aspect.cm_indexControl.title=Index Control
cm_contentmodel.aspect.cm_indexControl.description=Control Index Behaviour
cm_contentmodel.property.cm_isIndexed.title=Is Indexed
cm_contentmodel.property.cm_isIndexed.description=Is the node indexed and can be found via search.
cm_contentmodel.property.cm_isContentIndexed.title=Is Content Indexed
cm_contentmodel.property.cm_isContentIndexed.description=Are the node's d:content properties indexed?

View File

@@ -1419,10 +1419,12 @@
<property name="cm:isIndexed"> <property name="cm:isIndexed">
<title>Is indexed</title> <title>Is indexed</title>
<type>d:boolean</type> <type>d:boolean</type>
<default>true</default>
</property> </property>
<property name="cm:isContentIndexed"> <property name="cm:isContentIndexed">
<title>Is content indexed</title> <title>Is content indexed</title>
<type>d:boolean</type> <type>d:boolean</type>
<default>true</default>
</property> </property>
</properties> </properties>
</aspect> </aspect>

View File

@@ -476,16 +476,44 @@ public class AVMLuceneIndexerImpl extends AbstractLuceneIndexerImpl<String> impl
boolean isAtomic = true; boolean isAtomic = true;
Map<QName, Serializable> properties = getIndexableProperties(desc, nodeRef, endVersion, stringNodeRef); Map<QName, Serializable> properties = getIndexableProperties(desc, nodeRef, endVersion, stringNodeRef);
if(properties.containsKey(ContentModel.PROP_IS_INDEXED))
{
Serializable sValue = properties.get(ContentModel.PROP_IS_INDEXED);
if(sValue != null)
{
Boolean isIndexed = DefaultTypeConverter.INSTANCE.convert(Boolean.class, sValue);
if((isIndexed != null) && (isIndexed.booleanValue() == false))
{
return docs;
}
}
}
boolean isContentIndexedForNode = true;
if(properties.containsKey(ContentModel.PROP_IS_CONTENT_INDEXED))
{
Serializable sValue = properties.get(ContentModel.PROP_IS_CONTENT_INDEXED);
if(sValue != null)
{
Boolean isIndexed = DefaultTypeConverter.INSTANCE.convert(Boolean.class, sValue);
if((isIndexed != null) && (isIndexed.booleanValue() == false))
{
isContentIndexedForNode = false;
}
}
}
for (QName propertyName : properties.keySet()) for (QName propertyName : properties.keySet())
{ {
Serializable value = properties.get(propertyName); Serializable value = properties.get(propertyName);
if (indexAllProperties) if (indexAllProperties)
{ {
indexProperty(nodeRef, propertyName, value, xdoc, false, properties); indexProperty(nodeRef, propertyName, value, xdoc, false, properties, isContentIndexedForNode);
} }
else else
{ {
isAtomic &= indexProperty(nodeRef, propertyName, value, xdoc, true, properties); isAtomic &= indexProperty(nodeRef, propertyName, value, xdoc, true, properties,isContentIndexedForNode);
} }
} }
@@ -722,7 +750,7 @@ public class AVMLuceneIndexerImpl extends AbstractLuceneIndexerImpl<String> impl
} }
} }
protected boolean indexProperty(NodeRef banana, QName propertyName, Serializable value, Document doc, boolean indexAtomicPropertiesOnly, Map<QName, Serializable> properties) protected boolean indexProperty(NodeRef banana, QName propertyName, Serializable value, Document doc, boolean indexAtomicPropertiesOnly, Map<QName, Serializable> properties, boolean isContentIndexedForNode)
{ {
String attributeName = "@" + QName.createQName(propertyName.getNamespaceURI(), ISO9075.encode(propertyName.getLocalName())); String attributeName = "@" + QName.createQName(propertyName.getNamespaceURI(), ISO9075.encode(propertyName.getLocalName()));
@@ -811,119 +839,127 @@ public class AVMLuceneIndexerImpl extends AbstractLuceneIndexerImpl<String> impl
} }
doc.add(new Field(attributeName + ".locale", locale.toString().toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO)); doc.add(new Field(attributeName + ".locale", locale.toString().toLowerCase(), Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
ContentReader reader = null;
try
{
reader = contentService.getRawReader(contentData.getContentUrl());
reader.setEncoding(contentData.getEncoding());
reader.setLocale(contentData.getLocale());
reader.setMimetype(contentData.getMimetype());
}
catch (Exception e)
{
reader = null;
}
// ContentReader reader = contentService.getReader(banana, propertyName);
if (reader != null && reader.exists())
{
boolean readerReady = true;
// transform if necessary (it is not a UTF-8 text document)
if (!EqualsHelper.nullSafeEquals(reader.getMimetype(), MimetypeMap.MIMETYPE_TEXT_PLAIN) || !EqualsHelper.nullSafeEquals(reader.getEncoding(), "UTF-8"))
{
// get the transformer
ContentTransformer transformer = contentService.getTransformer(reader.getMimetype(), MimetypeMap.MIMETYPE_TEXT_PLAIN);
// is this transformer good enough?
if (transformer == null)
{
// log it
if (s_logger.isDebugEnabled())
{
s_logger.debug("Not indexed: No transformation: \n" + " source: " + reader + "\n" + " target: " + MimetypeMap.MIMETYPE_TEXT_PLAIN);
}
// don't index from the reader
readerReady = false;
// not indexed: no transformation
// doc.add(new Field("TEXT", NOT_INDEXED_NO_TRANSFORMATION, Field.Store.NO,
// Field.Index.TOKENIZED, Field.TermVector.NO));
doc.add(new Field(attributeName, NOT_INDEXED_NO_TRANSFORMATION, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
}
// else if (indexAtomicPropertiesOnly
// && transformer.getTransformationTime() > maxAtomicTransformationTime)
// {
// only indexing atomic properties
// indexing will take too long, so push it to the background
// wereAllAtomic = false;
// }
else
{
// We have a transformer that is fast enough
ContentWriter writer = contentService.getTempWriter();
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
// this is what the analyzers expect on the stream
writer.setEncoding("UTF-8");
try
{
transformer.transform(reader, writer); if(isContentIndexedForNode)
// point the reader to the new-written content {
reader = writer.getReader(); ContentReader reader = null;
// Check that the reader is a view onto something concrete try
if (!reader.exists()) {
{ reader = contentService.getRawReader(contentData.getContentUrl());
throw new ContentIOException("The transformation did not write any content, yet: \n" reader.setEncoding(contentData.getEncoding());
+ " transformer: " + transformer + "\n" + " temp writer: " + writer); reader.setLocale(contentData.getLocale());
} reader.setMimetype(contentData.getMimetype());
} }
catch (ContentIOException e) catch (Exception e)
{
reader = null;
}
// ContentReader reader = contentService.getReader(banana, propertyName);
if (reader != null && reader.exists())
{
boolean readerReady = true;
// transform if necessary (it is not a UTF-8 text document)
if (!EqualsHelper.nullSafeEquals(reader.getMimetype(), MimetypeMap.MIMETYPE_TEXT_PLAIN) || !EqualsHelper.nullSafeEquals(reader.getEncoding(), "UTF-8"))
{
// get the transformer
ContentTransformer transformer = contentService.getTransformer(reader.getMimetype(), MimetypeMap.MIMETYPE_TEXT_PLAIN);
// is this transformer good enough?
if (transformer == null)
{ {
// log it // log it
if (s_logger.isDebugEnabled()) if (s_logger.isDebugEnabled())
{ {
s_logger.debug("Not indexed: Transformation failed", e); s_logger.debug("Not indexed: No transformation: \n" + " source: " + reader + "\n" + " target: " + MimetypeMap.MIMETYPE_TEXT_PLAIN);
} }
// don't index from the reader // don't index from the reader
readerReady = false; readerReady = false;
// not indexed: transformation // not indexed: no transformation
// failed // doc.add(new Field("TEXT", NOT_INDEXED_NO_TRANSFORMATION, Field.Store.NO,
// doc.add(new Field("TEXT", NOT_INDEXED_TRANSFORMATION_FAILED, Field.Store.NO,
// Field.Index.TOKENIZED, Field.TermVector.NO)); // Field.Index.TOKENIZED, Field.TermVector.NO));
doc.add(new Field(attributeName, NOT_INDEXED_TRANSFORMATION_FAILED, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO)); doc.add(new Field(attributeName, NOT_INDEXED_NO_TRANSFORMATION, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
}
// else if (indexAtomicPropertiesOnly
// && transformer.getTransformationTime() > maxAtomicTransformationTime)
// {
// only indexing atomic properties
// indexing will take too long, so push it to the background
// wereAllAtomic = false;
// }
else
{
// We have a transformer that is fast enough
ContentWriter writer = contentService.getTempWriter();
writer.setMimetype(MimetypeMap.MIMETYPE_TEXT_PLAIN);
// this is what the analyzers expect on the stream
writer.setEncoding("UTF-8");
try
{
transformer.transform(reader, writer);
// point the reader to the new-written content
reader = writer.getReader();
// Check that the reader is a view onto something concrete
if (!reader.exists())
{
throw new ContentIOException("The transformation did not write any content, yet: \n"
+ " transformer: " + transformer + "\n" + " temp writer: " + writer);
}
}
catch (ContentIOException e)
{
// log it
if (s_logger.isDebugEnabled())
{
s_logger.debug("Not indexed: Transformation failed", e);
}
// don't index from the reader
readerReady = false;
// not indexed: transformation
// failed
// doc.add(new Field("TEXT", NOT_INDEXED_TRANSFORMATION_FAILED, Field.Store.NO,
// Field.Index.TOKENIZED, Field.TermVector.NO));
doc.add(new Field(attributeName, NOT_INDEXED_TRANSFORMATION_FAILED, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
}
} }
} }
// add the text field using the stream from the
// reader, but only if the reader is valid
if (readerReady)
{
InputStreamReader isr = null;
InputStream ris = reader.getReader().getContentInputStream();
try
{
isr = new InputStreamReader(ris, "UTF-8");
}
catch (UnsupportedEncodingException e)
{
isr = new InputStreamReader(ris);
}
StringBuilder builder = new StringBuilder();
builder.append("\u0000").append(locale.toString()).append("\u0000");
StringReader prefix = new StringReader(builder.toString());
Reader multiReader = new MultiReader(prefix, isr);
doc.add(new Field(attributeName, multiReader, Field.TermVector.NO));
}
} }
// add the text field using the stream from the else
// reader, but only if the reader is valid // URL not present (null reader) or no content at the URL (file missing)
if (readerReady)
{ {
InputStreamReader isr = null; // log it
InputStream ris = reader.getReader().getContentInputStream(); if (s_logger.isDebugEnabled())
try
{ {
isr = new InputStreamReader(ris, "UTF-8"); s_logger.debug("Not indexed: Content Missing \n"
+ " node: " + banana + "\n" + " reader: " + reader + "\n" + " content exists: "
+ (reader == null ? " --- " : Boolean.toString(reader.exists())));
} }
catch (UnsupportedEncodingException e) // not indexed: content missing
{ doc.add(new Field(attributeName, NOT_INDEXED_CONTENT_MISSING, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
isr = new InputStreamReader(ris);
}
StringBuilder builder = new StringBuilder();
builder.append("\u0000").append(locale.toString()).append("\u0000");
StringReader prefix = new StringReader(builder.toString());
Reader multiReader = new MultiReader(prefix, isr);
doc.add(new Field(attributeName, multiReader, Field.TermVector.NO));
} }
} }
else else
// URL not present (null reader) or no content at the URL (file missing)
{ {
// log it return true;
if (s_logger.isDebugEnabled())
{
s_logger.debug("Not indexed: Content Missing \n"
+ " node: " + banana + "\n" + " reader: " + reader + "\n" + " content exists: "
+ (reader == null ? " --- " : Boolean.toString(reader.exists())));
}
// not indexed: content missing
doc.add(new Field(attributeName, NOT_INDEXED_CONTENT_MISSING, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
} }
} }
else else
@@ -1871,7 +1907,7 @@ public class AVMLuceneIndexerImpl extends AbstractLuceneIndexerImpl<String> impl
{ {
indexedDocCount++; indexedDocCount++;
} }
public int getLastIndexedSnapshot(String store) public int getLastIndexedSnapshot(String store)
{ {
int last = getLastAsynchronousSnapshot(store); int last = getLastAsynchronousSnapshot(store);
@@ -1886,7 +1922,7 @@ public class AVMLuceneIndexerImpl extends AbstractLuceneIndexerImpl<String> impl
} }
return hasIndexBeenCreated(store) ? 0 : -1; return hasIndexBeenCreated(store) ? 0 : -1;
} }
private int getLastSynchronousSnapshot(String store) private int getLastSynchronousSnapshot(String store)
{ {
int answer = getLastSynchronousSnapshot(store, IndexChannel.DELTA); int answer = getLastSynchronousSnapshot(store, IndexChannel.DELTA);
@@ -2110,7 +2146,7 @@ public class AVMLuceneIndexerImpl extends AbstractLuceneIndexerImpl<String> impl
*/ */
public void deleteIndex(StoreRef storeRef) public void deleteIndex(StoreRef storeRef)
{ {
deleteIndex(); deleteIndex();
} }
} }