iDay. Indexing and WebPreviewing of Archive (currently zip) files.

I've added 2 transformers: zip to text/plain and zip to pdf.
This means that zip files will be indexable and therefore searchable. They will also now have webpreviews.

Each transformer outputs only the names of the entries in the zip file. The webpreview therefore shows a listing of the zip contents (not recursive for zips within zips), and searches match the entry names within a zip but not the content of those entries.

Also added a test class and a quick.zip file for testing.
These changes required some extension points in AbstractContentTransformerTest to support the zip transformation testing.


git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@20580 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
Neil McErlean
2010-06-10 11:24:23 +00:00
parent 1062614187
commit 0aa0184907
6 changed files with 252 additions and 4 deletions

View File

@@ -165,7 +165,7 @@
</property>
</bean>
<!-- Metadata Extraction Regisitry -->
<!-- Metadata Extraction Registry -->
<bean id="metadataExtracterRegistry" class="org.alfresco.repo.content.metadata.MetadataExtracterRegistry" />
<!-- Abstract bean definition defining base definition for all metadata extracters -->
@@ -471,4 +471,18 @@
</property>
</bean>
<!-- This transformer allows for the indexing and therefore searching of zip files. -->
<!-- It is backed by ArchiveContentTransformer and is explicitly registered for a single -->
<!-- transformation: application/zip to text/plain. -->
<bean id="transformer.Archive"
class="org.alfresco.repo.content.transform.ArchiveContentTransformer"
parent="baseContentTransformer" >
<property name="explicitTransformations">
<list>
<!-- The only source/target mimetype pair declared for this transformer. -->
<bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails" >
<property name="sourceMimetype"><value>application/zip</value></property>
<property name="targetMimetype"><value>text/plain</value></property>
</bean>
</list>
</property>
</bean>
</beans>

View File

@@ -57,4 +57,23 @@
</property>
</bean>
<!-- This transformer allows for the webpreviewing of zip archive files. -->
<!-- It chains three existing transformers, applied in the order listed below. -->
<bean id="transformer.complex.Archive.Pdf2swf"
class="org.alfresco.repo.content.transform.ComplexContentTransformer"
parent="baseContentTransformer" >
<property name="transformers">
<list>
<!-- Stage 1: the archive transformer (zip to text/plain). -->
<ref bean="transformer.Archive" />
<!-- Stage 2: text to PDF. -->
<ref bean="transformer.PdfBox.TextToPdf" />
<!-- Stage 3: PDF onwards; presumably produces SWF for the webpreview - confirm against the transformer.Pdf2swf bean. -->
<ref bean="transformer.Pdf2swf" />
</list>
</property>
<!-- The mimetypes passed between consecutive stages: one entry per hand-over, so two entries for three transformers. -->
<property name="intermediateMimetypes">
<list>
<value>text/plain</value>
<value>application/pdf</value>
</list>
</property>
</bean>
</beans>

View File

@@ -229,7 +229,7 @@ public abstract class AbstractContentTransformerTest extends TestCase
transformer.transform(sourceReader.getReader(), targetWriter);
// if the target format is any type of text, then it must contain the 'quick' phrase
if (targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN))
if (isQuickPhraseExpected(targetMimetype))
{
ContentReader targetReader = targetWriter.getReader();
String checkContent = targetReader.getContentString();
@@ -239,7 +239,7 @@ public abstract class AbstractContentTransformerTest extends TestCase
" target: " + targetWriter,
checkContent.contains(QUICK_CONTENT));
}
else if (targetMimetype.startsWith(StringExtractingContentTransformer.PREFIX_TEXT))
else if (isQuickWordsExpected(targetMimetype))
{
ContentReader targetReader = targetWriter.getReader();
String checkContent = targetReader.getContentString();
@@ -281,6 +281,32 @@ public abstract class AbstractContentTransformerTest extends TestCase
outputWriter.putContent(sb.toString());
}
/**
 * Extension point that lets a subclass switch the "quick brown fox" <i>phrase</i> assertion
 * on or off for a given transformation target.
 * Unless overridden, the phrase is asserted for text/plain targets only.
 *
 * @param targetMimetype mimetype of the target of the transformation
 * @return <code>true</code> if the transformed content must contain the phrase,
 *         <code>false</code> if that assertion is to be skipped.
 */
protected boolean isQuickPhraseExpected(String targetMimetype)
{
    final boolean targetIsPlainText = targetMimetype.equals(MimetypeMap.MIMETYPE_TEXT_PLAIN);
    return targetIsPlainText;
}
/**
 * Extension point that lets a subclass switch the per-word "quick brown fox" assertion
 * on or off for a given transformation target, i.e. the check that <i>each</i> word of the
 * phrase occurs in the transformed content.
 * Unless overridden, the words are asserted for every text/* target.
 *
 * @param targetMimetype mimetype of the target of the transformation
 * @return <code>true</code> if each word must be present in the transformed content,
 *         <code>false</code> if that assertion is to be skipped.
 */
protected boolean isQuickWordsExpected(String targetMimetype)
{
    final boolean targetIsSomeText = targetMimetype.startsWith(StringExtractingContentTransformer.PREFIX_TEXT);
    return targetIsSomeText;
}
/**
* This method is an extension point for excluding certain transformations in a subclass.
* The default implementation returns <code>false</code> for all mime type pairs.

View File

@@ -0,0 +1,123 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.TransformationOptions;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * This class transforms archive files (currently only ZIPs) to text, which enables indexing
 * and searching of archives as well as webpreviewing.
 * The transformation simply lists the names of the entries within the zip file and does not consider their content.
 *
 * @author Neil McErlean
 * @since Swift
 */
public class ArchiveContentTransformer extends AbstractContentTransformer2
{
    /**
     * The logger
     */
    private static final Log logger = LogFactory.getLog(ArchiveContentTransformer.class);

    /**
     * Currently the only transformation performed is the listing of zip entry names as plain text.
     *
     * @param sourceMimetype mimetype of the content to be transformed
     * @param targetMimetype mimetype of the content to be produced
     * @param options transformation options (not consulted by this implementation)
     * @return <code>true</code> only for a ZIP source and a text/plain target, else <code>false</code>.
     */
    public boolean isTransformable(String sourceMimetype, String targetMimetype, TransformationOptions options)
    {
        // TODO: Expand to other archive types e.g. tar.
        // Currently only support ZIP -> Text
        return MimetypeMap.MIMETYPE_ZIP.equals(sourceMimetype)
                && MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype);
    }

    /**
     * Reads the zip content from the reader, collects the name of every entry and writes
     * those names, one per line, to the writer.
     *
     * @param reader supplies the zip content as an input stream
     * @param writer receives the textual listing of entry names
     * @param options transformation options (not consulted by this implementation)
     * @throws Exception if the content cannot be read as a zip stream or cannot be written
     */
    protected void transformInternal(
            ContentReader reader,
            ContentWriter writer,
            TransformationOptions options) throws Exception
    {
        InputStream is = null;
        try
        {
            is = reader.getContentInputStream();
            ZipInputStream zin = new ZipInputStream(is);

            // Enumerate each entry
            List<String> zipEntryNames = new ArrayList<String>();
            ZipEntry nextZipEntry = null;
            while ((nextZipEntry = zin.getNextEntry()) != null)
            {
                // Only the entry name is recorded; the entry content is never read.
                // Currently we do not recurse into 'zips within zips'.
                zipEntryNames.add(nextZipEntry.getName());
            }

            if (logger.isDebugEnabled())
            {
                StringBuilder msg = new StringBuilder();
                msg.append("Transformed ")
                   .append(zipEntryNames.size())
                   .append(zipEntryNames.size() == 1 ? " zip entry" : " zip entries");
                logger.debug(msg.toString());
            }

            // dump it all to the writer
            writer.putContent(createTextContentFrom(zipEntryNames));
        }
        finally
        {
            if (is != null)
            {
                try
                {
                    is.close();
                }
                catch (Throwable e)
                {
                    // Closing is best effort: a failure here must not mask an exception thrown
                    // by the transformation itself, so it is logged rather than rethrown.
                    logger.warn("Failed to close the zip content input stream", e);
                }
            }
        }
    }

    /**
     * Builds the text output for the given entry names.
     *
     * @param zipEntryNames the names of the zip entries, in the order they were read
     * @return the names concatenated, each followed by a newline character
     *         (an empty string if there are no entries)
     */
    private String createTextContentFrom(List<String> zipEntryNames)
    {
        StringBuilder result = new StringBuilder();
        for (String entryName : zipEntryNames)
        {
            result.append(entryName)
                  .append('\n');
        }
        return result.toString();
    }
}

View File

@@ -0,0 +1,66 @@
/*
* Copyright (C) 2005-2010 Alfresco Software Limited.
*
* This file is part of Alfresco
*
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
*/
package org.alfresco.repo.content.transform;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.TransformationOptions;
/**
 * Test class for ArchiveContentTransformer.
 *
 * @see org.alfresco.repo.content.transform.ArchiveContentTransformer
 *
 * @author Neil McErlean
 */
public class ArchiveContentTransformerTest extends AbstractContentTransformerTest
{
    /** The transformer under test; recreated for every test in {@link #setUp()}. */
    private ContentTransformer transformer;

    @Override
    public void setUp() throws Exception
    {
        super.setUp();

        transformer = new ArchiveContentTransformer();
    }

    /**
     * Supplies the transformer under test, regardless of the mimetypes requested.
     *
     * @param sourceMimetype ignored
     * @param targetMimetype ignored
     * @return the ArchiveContentTransformer created in {@link #setUp()}
     */
    protected ContentTransformer getTransformer(String sourceMimetype, String targetMimetype)
    {
        return transformer;
    }

    /**
     * Checks that ZIP to text/plain is the one supported transformation and that the
     * other combinations of those two mimetypes are rejected.
     */
    public void testIsTransformable() throws Exception
    {
        TransformationOptions options = new TransformationOptions();

        // The single supported transformation.
        assertTrue(transformer.isTransformable(MimetypeMap.MIMETYPE_ZIP, MimetypeMap.MIMETYPE_TEXT_PLAIN, options));

        // Wrong target, wrong source, and the reverse direction must all be rejected.
        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_ZIP, MimetypeMap.MIMETYPE_ZIP, options));
        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_TEXT_PLAIN, options));
        assertFalse(transformer.isTransformable(MimetypeMap.MIMETYPE_TEXT_PLAIN, MimetypeMap.MIMETYPE_ZIP, options));
    }

    @Override
    protected boolean isQuickPhraseExpected(String targetMimetype)
    {
        // The Zip transformer produces names of the entries, not their contents.
        return false;
    }

    @Override
    protected boolean isQuickWordsExpected(String targetMimetype)
    {
        // The Zip transformer produces names of the entries, not their contents.
        return false;
    }
}

Binary file not shown.