mirror of
https://github.com/Alfresco/alfresco-community-repo.git
synced 2025-07-24 17:32:48 +00:00
Moving to root below branch label
git-svn-id: https://svn.alfresco.com/repos/alfresco-enterprise/alfresco/HEAD/root@2005 c4b6b30b-aa2e-2d43-bbcb-ca4b014f7261
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
abstract public class AbstractMetadataExtracter implements MetadataExtracter
|
||||
{
|
||||
|
||||
private Set<String> mimetypes;
|
||||
private double reliability;
|
||||
private long extractionTime;
|
||||
|
||||
protected AbstractMetadataExtracter(String mimetype, double reliability, long extractionTime)
|
||||
{
|
||||
this.mimetypes = Collections.singleton(mimetype);
|
||||
this.reliability = reliability;
|
||||
this.extractionTime = extractionTime;
|
||||
}
|
||||
|
||||
protected AbstractMetadataExtracter(Set<String> mimetypes, double reliability, long extractionTime)
|
||||
{
|
||||
this.mimetypes = mimetypes;
|
||||
this.reliability = reliability;
|
||||
this.extractionTime = extractionTime;
|
||||
}
|
||||
|
||||
public double getReliability(String sourceMimetype)
|
||||
{
|
||||
if (mimetypes.contains(sourceMimetype))
|
||||
return reliability;
|
||||
else
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
public long getExtractionTime()
|
||||
{
|
||||
return extractionTime;
|
||||
}
|
||||
|
||||
/**
|
||||
* Examines a value or string for nulls and adds it to the map (if
|
||||
* non-empty)
|
||||
*
|
||||
* @param prop Alfresco's <code>ContentModel.PROP_</code> to set.
|
||||
* @param value Value to set it to
|
||||
* @param destination Map into which to set it
|
||||
* @return true, if set, false otherwise
|
||||
*/
|
||||
protected boolean trimPut(QName prop, Object value, Map<QName, Serializable> destination)
|
||||
{
|
||||
if (value == null)
|
||||
return false;
|
||||
if (value instanceof String)
|
||||
{
|
||||
String svalue = ((String) value).trim();
|
||||
if (svalue.length() > 0)
|
||||
{
|
||||
destination.put(prop, svalue);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
else if (value instanceof Serializable)
|
||||
{
|
||||
destination.put(prop, (Serializable) value);
|
||||
}
|
||||
else
|
||||
{
|
||||
destination.put(prop, value.toString());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.net.URL;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.repo.content.filestore.FileContentReader;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.alfresco.util.BaseSpringTest;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
|
||||
/**
|
||||
* Provides a base set of tests for testing
|
||||
* {@link org.alfresco.repo.content.metadata.MetadataExtracter} implementations.
|
||||
*
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public abstract class AbstractMetadataExtracterTest extends BaseSpringTest
|
||||
{
|
||||
protected static final String QUICK_TITLE = "The quick brown fox jumps over the lazy dog";
|
||||
protected static final String QUICK_DESCRIPTION = "Gym class featuring a brown fox and lazy dog";
|
||||
protected static final String QUICK_CREATOR = "Nevin Nollop";
|
||||
protected static final String[] QUICK_WORDS = new String[] { "quick", "brown", "fox", "jumps", "lazy", "dog" };
|
||||
|
||||
protected MimetypeMap mimetypeMap;
|
||||
protected MetadataExtracter transformer;
|
||||
|
||||
public final void setMimetypeMap(MimetypeMap mimetypeMap)
|
||||
{
|
||||
this.mimetypeMap = mimetypeMap;
|
||||
}
|
||||
|
||||
protected abstract MetadataExtracter getExtracter();
|
||||
|
||||
/**
|
||||
* Ensures that the temp locations are cleaned out before the tests start
|
||||
*/
|
||||
@Override
|
||||
protected void onSetUpInTransaction() throws Exception
|
||||
{
|
||||
// perform a little cleaning up
|
||||
long now = System.currentTimeMillis();
|
||||
TempFileProvider.TempFileCleanerJob.removeFiles(now);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that all objects are present
|
||||
*/
|
||||
public void testSetUp() throws Exception
|
||||
{
|
||||
assertNotNull("MimetypeMap not present", mimetypeMap);
|
||||
// check that the quick resources are available
|
||||
File sourceFile = AbstractMetadataExtracterTest.loadQuickTestFile("txt");
|
||||
assertNotNull("quick.* files should be available from Tests", sourceFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to load one of the "The quick brown fox" files from the
|
||||
* classpath.
|
||||
*
|
||||
* @param extension the extension of the file required
|
||||
* @return Returns a test resource loaded from the classpath or
|
||||
* <tt>null</tt> if no resource could be found.
|
||||
* @throws IOException
|
||||
*/
|
||||
public static File loadQuickTestFile(String extension) throws IOException
|
||||
{
|
||||
URL url = AbstractMetadataExtracterTest.class.getClassLoader().getResource("quick/quick." + extension);
|
||||
if (url == null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
File file = new File(url.getFile());
|
||||
if (!file.exists())
|
||||
{
|
||||
return null;
|
||||
}
|
||||
return file;
|
||||
}
|
||||
|
||||
public Map<QName, Serializable> extractFromExtension(String ext, String mimetype) throws Exception
|
||||
{
|
||||
Map<QName, Serializable> destination = new HashMap<QName, Serializable>();
|
||||
|
||||
// attempt to get a source file for each mimetype
|
||||
File sourceFile = AbstractMetadataExtracterTest.loadQuickTestFile(ext);
|
||||
if (sourceFile == null)
|
||||
{
|
||||
throw new FileNotFoundException("No quick." + ext + " file found for test");
|
||||
}
|
||||
|
||||
// construct a reader onto the source file
|
||||
ContentReader sourceReader = new FileContentReader(sourceFile);
|
||||
sourceReader.setMimetype(mimetype);
|
||||
getExtracter().extract(sourceReader, destination);
|
||||
return destination;
|
||||
}
|
||||
|
||||
public void testCommonMetadata(Map<QName, Serializable> destination)
|
||||
{
|
||||
assertEquals(QUICK_TITLE, destination.get(ContentModel.PROP_TITLE));
|
||||
assertEquals(QUICK_DESCRIPTION, destination.get(ContentModel.PROP_DESCRIPTION));
|
||||
assertEquals(QUICK_CREATOR, destination.get(ContentModel.PROP_CREATOR));
|
||||
}
|
||||
}
|
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.swing.text.ChangedCharSetException;
|
||||
import javax.swing.text.MutableAttributeSet;
|
||||
import javax.swing.text.html.HTML;
|
||||
import javax.swing.text.html.HTMLEditorKit;
|
||||
import javax.swing.text.html.parser.ParserDelegator;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class HtmlMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
||||
private static final Log logger = LogFactory.getLog(HtmlMetadataExtracter.class);
|
||||
|
||||
public HtmlMetadataExtracter()
|
||||
{
|
||||
super(MimetypeMap.MIMETYPE_HTML, 1.0, 1000);
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
final Map<QName, Serializable> tempDestination = new HashMap<QName, Serializable>();
|
||||
try
|
||||
{
|
||||
HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback()
|
||||
{
|
||||
StringBuffer title = null;
|
||||
boolean inHead = false;
|
||||
|
||||
public void handleText(char[] data, int pos)
|
||||
{
|
||||
if (title != null)
|
||||
{
|
||||
title.append(data);
|
||||
}
|
||||
}
|
||||
|
||||
public void handleComment(char[] data, int pos)
|
||||
{
|
||||
// Perhaps sniff for Office 9+ metadata in here?
|
||||
}
|
||||
|
||||
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
|
||||
{
|
||||
if (HTML.Tag.HEAD.equals(t))
|
||||
{
|
||||
inHead = true;
|
||||
}
|
||||
else if (HTML.Tag.TITLE.equals(t) && inHead)
|
||||
{
|
||||
title = new StringBuffer();
|
||||
}
|
||||
else
|
||||
handleSimpleTag(t, a, pos);
|
||||
}
|
||||
|
||||
public void handleEndTag(HTML.Tag t, int pos)
|
||||
{
|
||||
if (HTML.Tag.HEAD.equals(t))
|
||||
{
|
||||
inHead = false;
|
||||
}
|
||||
else if (HTML.Tag.TITLE.equals(t))
|
||||
{
|
||||
trimPut(ContentModel.PROP_TITLE, title.toString(), tempDestination);
|
||||
title = null;
|
||||
}
|
||||
}
|
||||
|
||||
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
|
||||
{
|
||||
if (HTML.Tag.META.equals(t))
|
||||
{
|
||||
Object nameO = a.getAttribute(HTML.Attribute.NAME);
|
||||
Object valueO = a.getAttribute(HTML.Attribute.CONTENT);
|
||||
if (nameO == null || valueO == null)
|
||||
return;
|
||||
|
||||
String name = nameO.toString();
|
||||
|
||||
if (name.equalsIgnoreCase("creator") || name.equalsIgnoreCase("author")
|
||||
|| name.equalsIgnoreCase("dc.creator"))
|
||||
{
|
||||
trimPut(ContentModel.PROP_CREATOR, valueO, tempDestination);
|
||||
}
|
||||
if (name.equalsIgnoreCase("description") || name.equalsIgnoreCase("dc.description"))
|
||||
{
|
||||
trimPut(ContentModel.PROP_DESCRIPTION, valueO, tempDestination);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void handleError(String errorMsg, int pos)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
String charsetGuess = "UTF-8";
|
||||
int tries = 0;
|
||||
while (tries < 3)
|
||||
{
|
||||
tempDestination.clear();
|
||||
Reader r = null;
|
||||
InputStream cis = null;
|
||||
try
|
||||
{
|
||||
cis = reader.getContentInputStream();
|
||||
// TODO: for now, use default charset; we should attempt to map from html meta-data
|
||||
r = new InputStreamReader(cis);
|
||||
HTMLEditorKit.Parser parser = new ParserDelegator();
|
||||
parser.parse(r, callback, tries > 0);
|
||||
destination.putAll(tempDestination);
|
||||
break;
|
||||
}
|
||||
catch (ChangedCharSetException ccse)
|
||||
{
|
||||
tries++;
|
||||
charsetGuess = ccse.getCharSetSpec();
|
||||
int begin = charsetGuess.indexOf("charset=");
|
||||
if (begin > 0)
|
||||
charsetGuess = charsetGuess.substring(begin + 8, charsetGuess.length());
|
||||
reader = reader.getReader();
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (r != null)
|
||||
r.close();
|
||||
if (cis != null)
|
||||
cis.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
throw new ContentIOException("HTML metadata extraction failed: \n" + " reader: " + reader, e);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.OfficeMetadataExtracter
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class HtmlMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(HtmlMetadataExtracterTest.class);
|
||||
private MetadataExtracter extracter;
|
||||
|
||||
public void onSetUpInTransaction() throws Exception
|
||||
{
|
||||
extracter = new HtmlMetadataExtracter();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected MetadataExtracter getExtracter()
|
||||
{
|
||||
return extracter;
|
||||
}
|
||||
|
||||
public void testReliability() throws Exception
|
||||
{
|
||||
double reliability = 0.0;
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
assertEquals("Mimetype text should not be supported", 0.0, reliability);
|
||||
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_HTML);
|
||||
assertEquals("HTML should be supported", 1.0, reliability);
|
||||
}
|
||||
|
||||
public void testHtmlExtraction() throws Exception
|
||||
{
|
||||
testCommonMetadata(extractFromExtension("html", MimetypeMap.MIMETYPE_HTML));
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,245 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Alfresco, Inc.
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.alfresco.util.GUID;
|
||||
import org.farng.mp3.AbstractMP3FragmentBody;
|
||||
import org.farng.mp3.MP3File;
|
||||
import org.farng.mp3.TagException;
|
||||
import org.farng.mp3.id3.AbstractID3v2;
|
||||
import org.farng.mp3.id3.AbstractID3v2Frame;
|
||||
import org.farng.mp3.id3.ID3v1;
|
||||
import org.farng.mp3.lyrics3.AbstractLyrics3;
|
||||
import org.farng.mp3.lyrics3.Lyrics3v2;
|
||||
import org.farng.mp3.lyrics3.Lyrics3v2Field;
|
||||
|
||||
/**
|
||||
* @author Roy Wetherall
|
||||
*/
|
||||
public class MP3MetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
private static final QName PROP_ALBUM_TITLE = QName.createQName("{music}albumTitle");
|
||||
private static final QName PROP_SONG_TITLE = QName.createQName("{music}songTitle");;
|
||||
private static final QName PROP_ARTIST = QName.createQName("{music}artist");;
|
||||
private static final QName PROP_COMMENT = QName.createQName("{music}comment");;
|
||||
private static final QName PROP_YEAR_RELEASED = QName.createQName("{music}yearReleased");;
|
||||
private static final QName PROP_TRACK_NUMBER = QName.createQName("{music}trackNumber");;
|
||||
private static final QName PROP_GENRE = QName.createQName("{music}genre");;
|
||||
private static final QName PROP_COMPOSER = QName.createQName("{music}composer");;
|
||||
private static final QName PROP_LYRICS = QName.createQName("{music}lyrics");;
|
||||
|
||||
public MP3MetadataExtracter()
|
||||
{
|
||||
super(MimetypeMap.MIMETYPE_MP3, 1.0, 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.metadata.MetadataExtracter#extract(org.alfresco.service.cmr.repository.ContentReader, java.util.Map)
|
||||
*/
|
||||
public void extract(ContentReader reader,
|
||||
Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
try
|
||||
{
|
||||
Map<QName, Serializable> props = new HashMap<QName, Serializable>();
|
||||
|
||||
// Create a temp file
|
||||
File tempFile = File.createTempFile(GUID.generate(), ".tmp");
|
||||
try
|
||||
{
|
||||
reader.getContent(tempFile);
|
||||
|
||||
// Create the MP3 object from the file
|
||||
MP3File mp3File = new MP3File(tempFile);
|
||||
|
||||
ID3v1 id3v1 = mp3File.getID3v1Tag();
|
||||
if (id3v1 != null)
|
||||
{
|
||||
setTagValue(props, PROP_ALBUM_TITLE, id3v1.getAlbum());
|
||||
setTagValue(props, PROP_SONG_TITLE, id3v1.getTitle());
|
||||
setTagValue(props, PROP_ARTIST, id3v1.getArtist());
|
||||
setTagValue(props, PROP_COMMENT, id3v1.getComment());
|
||||
setTagValue(props, PROP_YEAR_RELEASED, id3v1.getYear());
|
||||
|
||||
// TODO sort out the genre
|
||||
//setTagValue(props, MusicModel.PROP_GENRE, id3v1.getGenre());
|
||||
|
||||
// TODO sort out the size
|
||||
//setTagValue(props, MusicModel.PROP_SIZE, id3v1.getSize());
|
||||
}
|
||||
|
||||
AbstractID3v2 id3v2 = mp3File.getID3v2Tag();
|
||||
if (id3v2 != null)
|
||||
{
|
||||
setTagValue(props, PROP_SONG_TITLE, getID3V2Value(id3v2, "TIT2"));
|
||||
setTagValue(props, PROP_ARTIST, getID3V2Value(id3v2, "TPE1"));
|
||||
setTagValue(props, PROP_ALBUM_TITLE, getID3V2Value(id3v2, "TALB"));
|
||||
setTagValue(props, PROP_YEAR_RELEASED, getID3V2Value(id3v2, "TDRC"));
|
||||
setTagValue(props, PROP_COMMENT, getID3V2Value(id3v2, "COMM"));
|
||||
setTagValue(props, PROP_TRACK_NUMBER, getID3V2Value(id3v2, "TRCK"));
|
||||
setTagValue(props, PROP_GENRE, getID3V2Value(id3v2, "TCON"));
|
||||
setTagValue(props, PROP_COMPOSER, getID3V2Value(id3v2, "TCOM"));
|
||||
|
||||
// TODO sort out the lyrics
|
||||
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "SYLT"));
|
||||
//System.out.println("Lyrics: " + getID3V2Value(id3v2, "USLT"));
|
||||
}
|
||||
|
||||
AbstractLyrics3 lyrics3Tag = mp3File.getLyrics3Tag();
|
||||
if (lyrics3Tag != null)
|
||||
{
|
||||
System.out.println("Lyrics3 tag found.");
|
||||
if (lyrics3Tag instanceof Lyrics3v2)
|
||||
{
|
||||
setTagValue(props, PROP_SONG_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TIT2"));
|
||||
setTagValue(props, PROP_ARTIST, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TPE1"));
|
||||
setTagValue(props, PROP_ALBUM_TITLE, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TALB"));
|
||||
setTagValue(props, PROP_COMMENT, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "COMM"));
|
||||
setTagValue(props, PROP_LYRICS, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "SYLT"));
|
||||
setTagValue(props, PROP_COMPOSER, getLyrics3v2Value((Lyrics3v2)lyrics3Tag, "TCOM"));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
finally
|
||||
{
|
||||
tempFile.delete();
|
||||
}
|
||||
|
||||
// Set the destination values
|
||||
if (props.get(PROP_SONG_TITLE) != null)
|
||||
{
|
||||
destination.put(ContentModel.PROP_TITLE, props.get(PROP_SONG_TITLE));
|
||||
}
|
||||
if (props.get(PROP_ARTIST) != null)
|
||||
{
|
||||
destination.put(ContentModel.PROP_CREATOR, props.get(PROP_ARTIST));
|
||||
}
|
||||
String description = getDescription(props);
|
||||
if (description != null)
|
||||
{
|
||||
destination.put(ContentModel.PROP_DESCRIPTION, description);
|
||||
}
|
||||
}
|
||||
catch (IOException ioException)
|
||||
{
|
||||
// TODO sort out exception handling
|
||||
throw new RuntimeException("Error reading mp3 file.", ioException);
|
||||
}
|
||||
catch (TagException tagException)
|
||||
{
|
||||
// TODO sort out exception handling
|
||||
throw new RuntimeException("Error reading mp3 tag information.", tagException);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generate the description
|
||||
*
|
||||
* @param props the properties extracted from the file
|
||||
* @return the description
|
||||
*/
|
||||
private String getDescription(Map<QName, Serializable> props)
|
||||
{
|
||||
StringBuilder result = new StringBuilder();
|
||||
if (props.get(PROP_SONG_TITLE) != null && props.get(PROP_ARTIST) != null && props.get(PROP_ALBUM_TITLE) != null)
|
||||
{
|
||||
result
|
||||
.append(props.get(PROP_SONG_TITLE))
|
||||
.append(" - ")
|
||||
.append(props.get(PROP_ALBUM_TITLE))
|
||||
.append(" (")
|
||||
.append(props.get(PROP_ARTIST))
|
||||
.append(")");
|
||||
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param props
|
||||
* @param propQName
|
||||
* @param propvalue
|
||||
*/
|
||||
private void setTagValue(Map<QName, Serializable> props, QName propQName, String propvalue)
|
||||
{
|
||||
if (propvalue != null && propvalue.length() != 0)
|
||||
{
|
||||
trimPut(propQName, propvalue, props);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param lyrics3Tag
|
||||
* @param name
|
||||
* @return
|
||||
*/
|
||||
private String getLyrics3v2Value(Lyrics3v2 lyrics3Tag, String name)
|
||||
{
|
||||
String result = "";
|
||||
Lyrics3v2Field field = lyrics3Tag.getField(name);
|
||||
if (field != null)
|
||||
{
|
||||
AbstractMP3FragmentBody body = field.getBody();
|
||||
if (body != null)
|
||||
{
|
||||
result = (String)body.getObject("Text");
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the ID3V2 tag value in a safe way
|
||||
*
|
||||
* @param id3v2
|
||||
* @param name
|
||||
* @return
|
||||
*/
|
||||
private String getID3V2Value(AbstractID3v2 id3v2, String name)
|
||||
{
|
||||
String result = "";
|
||||
|
||||
AbstractID3v2Frame frame = id3v2.getFrame(name);
|
||||
if (frame != null)
|
||||
{
|
||||
AbstractMP3FragmentBody body = frame.getBody();
|
||||
if (body != null)
|
||||
{
|
||||
result = (String)body.getObject("Text");
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public interface MetadataExtracter
|
||||
{
|
||||
/**
|
||||
* Provides the approximate accuracy with which this extracter can extract
|
||||
* metadata for the mimetype.
|
||||
* <p>
|
||||
*
|
||||
* @param sourceMimetype the source mimetype
|
||||
* @return Returns a score 0.0 to 1.0. 0.0 indicates that the extraction
|
||||
* cannot be performed at all. 1.0 indicates that the extraction can
|
||||
* be performed perfectly.
|
||||
*/
|
||||
public double getReliability(String sourceMimetype);
|
||||
|
||||
/**
|
||||
* Provides an estimate, usually a worst case guess, of how long an
|
||||
* extraction will take.
|
||||
* <p>
|
||||
* This method is used to determine, up front, which of a set of equally
|
||||
* reliant transformers will be used for a specific extraction.
|
||||
*
|
||||
* @return Returns the approximate number of milliseconds per transformation
|
||||
*/
|
||||
public long getExtractionTime();
|
||||
|
||||
/**
|
||||
* Extracts the metadata from the content provided by the reader and source
|
||||
* mimetype to the supplied map.
|
||||
* <p>
|
||||
* The extraction viability can be determined by an up front call to
|
||||
* {@link #getReliability(String)}.
|
||||
* <p>
|
||||
* The source mimetype <b>must</b> be available on the
|
||||
* {@link org.alfresco.service.cmr.repository.ContentAccessor#getMimetype()} method
|
||||
* of the reader.
|
||||
*
|
||||
* @param reader the source of the content
|
||||
* @param destination the destination of the extraction
|
||||
* @throws ContentIOException if an IO exception occurs
|
||||
*/
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException;
|
||||
|
||||
}
|
@@ -0,0 +1,180 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReadWriteLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
|
||||
import org.alfresco.error.AlfrescoRuntimeException;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.springframework.util.Assert;
|
||||
|
||||
/**
|
||||
* Holds and provides the most appropriate metadate extracter for a particular
|
||||
* mimetype.
|
||||
* <p>
|
||||
* The extracters themselves know how well they are able to extract metadata.
|
||||
*
|
||||
* @see org.alfresco.repo.content.metadata.MetadataExtracter
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class MetadataExtracterRegistry
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(MetadataExtracterRegistry.class);
|
||||
|
||||
private List<MetadataExtracter> extracters;
|
||||
private Map<String, MetadataExtracter> extracterCache;
|
||||
|
||||
private MimetypeMap mimetypeMap;
|
||||
/** Controls read access to the cache */
|
||||
private Lock extracterCacheReadLock;
|
||||
/** controls write access to the cache */
|
||||
private Lock extracterCacheWriteLock;
|
||||
|
||||
/**
|
||||
* @param mimetypeMap all the mimetypes available to the system
|
||||
*/
|
||||
public MetadataExtracterRegistry(MimetypeMap mimetypeMap)
|
||||
{
|
||||
Assert.notNull(mimetypeMap, "The MimetypeMap is mandatory");
|
||||
this.mimetypeMap = mimetypeMap;
|
||||
|
||||
extracters = Collections.emptyList(); // just in case it isn't set
|
||||
extracterCache = new HashMap<String, MetadataExtracter>(17);
|
||||
|
||||
// create lock objects for access to the cache
|
||||
ReadWriteLock extractionCacheLock = new ReentrantReadWriteLock();
|
||||
extracterCacheReadLock = extractionCacheLock.readLock();
|
||||
extracterCacheWriteLock = extractionCacheLock.writeLock();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the best metadata extracter. This is a combination of the most
|
||||
* reliable and the most performant extracter.
|
||||
* <p>
|
||||
* The result is cached for quicker access next time.
|
||||
*
|
||||
* @param mimetype the source MIME of the extraction
|
||||
* @return Returns a metadata extracter that can extract metadata from the
|
||||
* chosen MIME type.
|
||||
*/
|
||||
public MetadataExtracter getExtracter(String sourceMimetype)
|
||||
{
|
||||
// check that the mimetypes are valid
|
||||
if (!mimetypeMap.getMimetypes().contains(sourceMimetype))
|
||||
{
|
||||
throw new AlfrescoRuntimeException("Unknown extraction source mimetype: " + sourceMimetype);
|
||||
}
|
||||
|
||||
MetadataExtracter extracter = null;
|
||||
extracterCacheReadLock.lock();
|
||||
try
|
||||
{
|
||||
if (extracterCache.containsKey(sourceMimetype))
|
||||
{
|
||||
// the translation has been requested before
|
||||
// it might have been null
|
||||
return extracterCache.get(sourceMimetype);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
extracterCacheReadLock.unlock();
|
||||
}
|
||||
|
||||
// the translation has not been requested before
|
||||
// get a write lock on the cache
|
||||
// no double check done as it is not an expensive task
|
||||
extracterCacheWriteLock.lock();
|
||||
try
|
||||
{
|
||||
// find the most suitable transformer - may be empty list
|
||||
extracter = findBestExtracter(sourceMimetype);
|
||||
// store the result even if it is null
|
||||
extracterCache.put(sourceMimetype, extracter);
|
||||
return extracter;
|
||||
}
|
||||
finally
|
||||
{
|
||||
extracterCacheWriteLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param sourceMimetype The MIME type under examination
|
||||
* @return The fastest of the most reliable extracters in
|
||||
* <code>extracters</code> for the given MIME type.
|
||||
*/
|
||||
private MetadataExtracter findBestExtracter(String sourceMimetype)
|
||||
{
|
||||
double bestReliability = -1;
|
||||
long bestTime = Long.MAX_VALUE;
|
||||
logger.debug("Finding best extracter for " + sourceMimetype);
|
||||
|
||||
MetadataExtracter bestExtracter = null;
|
||||
|
||||
for (MetadataExtracter ext : extracters)
|
||||
{
|
||||
double r = ext.getReliability(sourceMimetype);
|
||||
if (r == bestReliability)
|
||||
{
|
||||
long time = ext.getExtractionTime();
|
||||
if (time < bestTime)
|
||||
{
|
||||
bestExtracter = ext;
|
||||
bestTime = time;
|
||||
}
|
||||
}
|
||||
else if (r > bestReliability)
|
||||
{
|
||||
bestExtracter = ext;
|
||||
bestReliability = r;
|
||||
bestTime = ext.getExtractionTime();
|
||||
}
|
||||
}
|
||||
return bestExtracter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides a list of self-discovering extracters.
|
||||
*
|
||||
* @param transformers all the available extracters that the registry can
|
||||
* work with
|
||||
*/
|
||||
public void setExtracters(List<MetadataExtracter> extracters)
|
||||
{
|
||||
logger.debug("Setting " + extracters.size() + "new extracters.");
|
||||
|
||||
extracterCacheWriteLock.lock();
|
||||
try
|
||||
{
|
||||
this.extracters = extracters;
|
||||
this.extracterCache.clear();
|
||||
}
|
||||
finally
|
||||
{
|
||||
extracterCacheWriteLock.unlock();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||
import org.apache.poi.hpsf.PropertySet;
|
||||
import org.apache.poi.hpsf.PropertySetFactory;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
|
||||
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class OfficeMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
||||
private static final Log logger = LogFactory.getLog(OfficeMetadataExtracter.class);
|
||||
private static String[] mimeTypes = new String[] { MimetypeMap.MIMETYPE_WORD, MimetypeMap.MIMETYPE_EXCEL,
|
||||
MimetypeMap.MIMETYPE_PPT };
|
||||
|
||||
public OfficeMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.0, 1000);
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, final Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
POIFSReaderListener readerListener = new POIFSReaderListener()
|
||||
{
|
||||
public void processPOIFSReaderEvent(final POIFSReaderEvent event)
|
||||
{
|
||||
try
|
||||
{
|
||||
PropertySet ps = PropertySetFactory.create(event.getStream());
|
||||
if (ps instanceof SummaryInformation)
|
||||
{
|
||||
SummaryInformation si = (SummaryInformation) ps;
|
||||
// Titled aspect
|
||||
trimPut(ContentModel.PROP_TITLE, si.getTitle(), destination);
|
||||
trimPut(ContentModel.PROP_DESCRIPTION, si.getSubject(), destination);
|
||||
|
||||
// Auditable aspect
|
||||
trimPut(ContentModel.PROP_CREATED, si.getCreateDateTime(), destination);
|
||||
trimPut(ContentModel.PROP_CREATOR, si.getAuthor(), destination);
|
||||
trimPut(ContentModel.PROP_MODIFIED, si.getLastSaveDateTime(), destination);
|
||||
trimPut(ContentModel.PROP_MODIFIER, si.getLastAuthor(), destination);
|
||||
}
|
||||
else if (ps instanceof DocumentSummaryInformation)
|
||||
{
|
||||
DocumentSummaryInformation dsi = (DocumentSummaryInformation) ps;
|
||||
|
||||
// These are not really interesting to any aspect:
|
||||
// trimPut(ContentModel.PROP_xxx, dsi.getCompany(),
|
||||
// destination);
|
||||
// trimPut(ContentModel.PROP_yyy, dsi.getManager(),
|
||||
// destination);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
throw new ContentIOException("Property set stream: " + event.getPath() + event.getName(), ex);
|
||||
}
|
||||
}
|
||||
};
|
||||
try
|
||||
{
|
||||
POIFSReader r = new POIFSReader();
|
||||
r.registerListener(readerListener, SummaryInformation.DEFAULT_STREAM_NAME);
|
||||
r.read(reader.getContentInputStream());
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
throw new ContentIOException("Compound Document SummaryInformation metadata extraction failed: \n"
|
||||
+ " reader: " + reader,
|
||||
e);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,60 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.OfficeMetadataExtracter
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class OfficeMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(OfficeMetadataExtracterTest.class);
|
||||
private MetadataExtracter extracter;
|
||||
|
||||
public void onSetUpInTransaction() throws Exception
|
||||
{
|
||||
extracter = new OfficeMetadataExtracter();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected MetadataExtracter getExtracter()
|
||||
{
|
||||
return extracter;
|
||||
}
|
||||
|
||||
public void testReliability() throws Exception
|
||||
{
|
||||
double reliability = 0.0;
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
assertEquals("Mimetype text should not be supported", 0.0, reliability);
|
||||
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_WORD);
|
||||
assertEquals("Word should be supported", 1.0, reliability);
|
||||
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_EXCEL);
|
||||
assertEquals("Excel should be supported", 1.0, reliability);
|
||||
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_PPT);
|
||||
assertEquals("PowerPoint should be supported", 1.0, reliability);
|
||||
}
|
||||
|
||||
public void testWordExtraction() throws Exception
|
||||
{
|
||||
testCommonMetadata(extractFromExtension("doc", MimetypeMap.MIMETYPE_WORD));
|
||||
}
|
||||
|
||||
public void testExcelExtraction() throws Exception
|
||||
{
|
||||
testCommonMetadata(extractFromExtension("xls", MimetypeMap.MIMETYPE_EXCEL));
|
||||
}
|
||||
|
||||
public void testPowerPointExtraction() throws Exception
|
||||
{
|
||||
testCommonMetadata(extractFromExtension("ppt", MimetypeMap.MIMETYPE_PPT));
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Antti Jokipii
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.catcode.odf.ODFMetaFileAnalyzer;
|
||||
import com.catcode.odf.OpenDocumentMetadata;
|
||||
|
||||
/**
|
||||
* Metadata extractor for the
|
||||
* {@link org.alfresco.repo.content.MimetypeMap#MIMETYPE_OPENDOCUMENT_TEXT MIMETYPE_OPENDOCUMENT_XXX}
|
||||
* mimetypes.
|
||||
*
|
||||
* @author Antti Jokipii
|
||||
*/
|
||||
public class OpenDocumentMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(OpenDocumentMetadataExtracter.class);
|
||||
|
||||
private static String[] mimeTypes = new String[] {
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_GRAPHICS,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_GRAPHICS_TEMPLATE,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_PRESENTATION_TEMPLATE,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_SPREADSHEET,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_SPREADSHEET_TEMPLATE,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_CHART,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_CHART_TEMPLATE,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_IMAGE,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_IMAGE_TEMPLATE,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_FORMULA,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_FORMULA_TEMPLATE,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_MASTER,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT_WEB,
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_DATABASE, };
|
||||
|
||||
public OpenDocumentMetadataExtracter()
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 1000);
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
ODFMetaFileAnalyzer analyzer = new ODFMetaFileAnalyzer();
|
||||
try
|
||||
{
|
||||
// stream the document in
|
||||
OpenDocumentMetadata docInfo = analyzer.analyzeZip(reader.getContentInputStream());
|
||||
|
||||
if (docInfo != null)
|
||||
{
|
||||
// set the metadata
|
||||
destination.put(ContentModel.PROP_CREATOR, docInfo.getCreator());
|
||||
destination.put(ContentModel.PROP_TITLE, docInfo.getTitle());
|
||||
destination.put(ContentModel.PROP_DESCRIPTION, docInfo.getDescription());
|
||||
destination.put(ContentModel.PROP_CREATED, docInfo.getCreationDate());
|
||||
}
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
String message = "Metadata extraction failed: \n" +
|
||||
" reader: " + reader;
|
||||
logger.debug(message, e);
|
||||
throw new ContentIOException(message, e);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Calendar;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.pdfbox.pdmodel.PDDocument;
|
||||
import org.pdfbox.pdmodel.PDDocumentInformation;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class PdfBoxMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
||||
private static final Log logger = LogFactory.getLog(PdfBoxMetadataExtracter.class);
|
||||
|
||||
public PdfBoxMetadataExtracter()
|
||||
{
|
||||
super(MimetypeMap.MIMETYPE_PDF, 1.0, 1000);
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
if (!MimetypeMap.MIMETYPE_PDF.equals(reader.getMimetype()))
|
||||
{
|
||||
logger.debug("No metadata extracted for " + reader.getMimetype());
|
||||
return;
|
||||
}
|
||||
PDDocument pdf = null;
|
||||
try
|
||||
{
|
||||
// stream the document in
|
||||
pdf = PDDocument.load(reader.getContentInputStream());
|
||||
// Scoop out the metadata
|
||||
PDDocumentInformation docInfo = pdf.getDocumentInformation();
|
||||
|
||||
trimPut(ContentModel.PROP_CREATOR, docInfo.getAuthor(), destination);
|
||||
trimPut(ContentModel.PROP_TITLE, docInfo.getTitle(), destination);
|
||||
trimPut(ContentModel.PROP_DESCRIPTION, docInfo.getSubject(), destination);
|
||||
|
||||
Calendar created = docInfo.getCreationDate();
|
||||
if (created != null)
|
||||
destination.put(ContentModel.PROP_CREATED, created.getTime());
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
throw new ContentIOException("PDF metadata extraction failed: \n" +
|
||||
" reader: " + reader);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (pdf != null)
|
||||
{
|
||||
try
|
||||
{
|
||||
pdf.close();
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,43 @@
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.PdfBoxContentTransformer
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class PdfBoxMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private static final Log logger = LogFactory.getLog(PdfBoxMetadataExtracterTest.class);
|
||||
private MetadataExtracter extracter;
|
||||
|
||||
public void onSetUpInTransaction() throws Exception
|
||||
{
|
||||
extracter = new PdfBoxMetadataExtracter();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same transformer regardless - it is allowed
|
||||
*/
|
||||
protected MetadataExtracter getExtracter()
|
||||
{
|
||||
return extracter;
|
||||
}
|
||||
|
||||
public void testReliability() throws Exception
|
||||
{
|
||||
double reliability = 0.0;
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
assertEquals("Mimetype should not be supported", 0.0, reliability);
|
||||
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_PDF);
|
||||
assertEquals("Mimetype should be supported", 1.0, reliability);
|
||||
}
|
||||
|
||||
public void testPdfExtraction() throws Exception
|
||||
{
|
||||
testCommonMetadata(extractFromExtension("pdf", MimetypeMap.MIMETYPE_PDF));
|
||||
}
|
||||
}
|
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class StringMetadataExtracter implements MetadataExtracter
|
||||
{
|
||||
public static final String PREFIX_TEXT = "text/";
|
||||
|
||||
private static final Log logger = LogFactory.getLog(StringMetadataExtracter.class);
|
||||
|
||||
public double getReliability(String sourceMimetype)
|
||||
{
|
||||
if (sourceMimetype.startsWith(PREFIX_TEXT))
|
||||
return 0.1;
|
||||
else
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
public long getExtractionTime()
|
||||
{
|
||||
return 1000;
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug("No metadata extracted for " + reader.getMimetype());
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,205 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.Serializable;
|
||||
import java.net.ConnectException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import net.sf.joott.uno.UnoConnection;
|
||||
|
||||
import org.alfresco.model.ContentModel;
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
import org.alfresco.service.cmr.repository.ContentIOException;
|
||||
import org.alfresco.service.cmr.repository.ContentReader;
|
||||
import org.alfresco.service.namespace.QName;
|
||||
import org.alfresco.util.TempFileProvider;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.sun.star.beans.PropertyValue;
|
||||
import com.sun.star.beans.XPropertySet;
|
||||
import com.sun.star.document.XDocumentInfoSupplier;
|
||||
import com.sun.star.frame.XComponentLoader;
|
||||
import com.sun.star.lang.XComponent;
|
||||
import com.sun.star.ucb.XFileIdentifierConverter;
|
||||
import com.sun.star.uno.UnoRuntime;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class UnoMetadataExtracter extends AbstractMetadataExtracter
|
||||
{
|
||||
|
||||
private static final Log logger = LogFactory.getLog(UnoMetadataExtracter.class);
|
||||
|
||||
private static String[] mimeTypes = new String[] {
|
||||
MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT,
|
||||
MimetypeMap.MIMETYPE_OPENOFFICE_WRITER,
|
||||
// Add the other OpenOffice.org stuff here
|
||||
// In fact, other types may apply as well, but should be counted as lower
|
||||
// quality since they involve conversion.
|
||||
};
|
||||
|
||||
public UnoMetadataExtracter(MimetypeMap mimetypeMap, String connectionUrl)
|
||||
{
|
||||
super(new HashSet<String>(Arrays.asList(mimeTypes)), 1.00, 10000);
|
||||
this.mimetypeMap = mimetypeMap;
|
||||
init(connectionUrl);
|
||||
}
|
||||
|
||||
public UnoMetadataExtracter(MimetypeMap mimetypeMap)
|
||||
{
|
||||
this(mimetypeMap, UnoConnection.DEFAULT_CONNECTION_STRING);
|
||||
}
|
||||
|
||||
private MimetypeMap mimetypeMap;
|
||||
private MyUnoConnection connection;
|
||||
private boolean isConnected;
|
||||
|
||||
/**
|
||||
* @param unoConnectionUrl the URL of the Uno server
|
||||
*/
|
||||
private synchronized void init(String unoConnectionUrl)
|
||||
{
|
||||
connection = new MyUnoConnection(unoConnectionUrl);
|
||||
// attempt to make an connection
|
||||
try
|
||||
{
|
||||
connection.connect();
|
||||
isConnected = true;
|
||||
}
|
||||
catch (ConnectException e)
|
||||
{
|
||||
isConnected = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns true if a connection to the Uno server could be
|
||||
* established
|
||||
*/
|
||||
public boolean isConnected()
|
||||
{
|
||||
return isConnected;
|
||||
}
|
||||
|
||||
public void extract(ContentReader reader, final Map<QName, Serializable> destination) throws ContentIOException
|
||||
{
|
||||
String sourceMimetype = reader.getMimetype();
|
||||
|
||||
// create temporary files to convert from and to
|
||||
File tempFromFile = TempFileProvider.createTempFile("UnoContentTransformer", "."
|
||||
+ mimetypeMap.getExtension(sourceMimetype));
|
||||
// download the content from the source reader
|
||||
reader.getContent(tempFromFile);
|
||||
String sourceUrl = tempFromFile.toString();
|
||||
try
|
||||
{
|
||||
sourceUrl = toUrl(tempFromFile, connection);
|
||||
|
||||
// UNO Interprocess Bridge *should* be thread-safe, but...
|
||||
synchronized (connection)
|
||||
{
|
||||
XComponentLoader desktop = connection.getDesktop();
|
||||
XComponent document = desktop.loadComponentFromURL(
|
||||
sourceUrl,
|
||||
"_blank",
|
||||
0,
|
||||
new PropertyValue[] { property("Hidden", Boolean.TRUE) });
|
||||
if (document == null)
|
||||
{
|
||||
throw new FileNotFoundException("could not open source document: " + sourceUrl);
|
||||
}
|
||||
try
|
||||
{
|
||||
XDocumentInfoSupplier infoSupplier = (XDocumentInfoSupplier) UnoRuntime.queryInterface(
|
||||
XDocumentInfoSupplier.class, document);
|
||||
XPropertySet propSet = (XPropertySet) UnoRuntime.queryInterface(
|
||||
XPropertySet.class,
|
||||
infoSupplier
|
||||
.getDocumentInfo());
|
||||
|
||||
// Titled aspect
|
||||
trimPut(ContentModel.PROP_TITLE, propSet.getPropertyValue("Title"), destination);
|
||||
trimPut(ContentModel.PROP_DESCRIPTION, propSet.getPropertyValue("Subject"), destination);
|
||||
|
||||
// Auditable aspect
|
||||
// trimPut(ContentModel.PROP_CREATED,
|
||||
// si.getCreateDateTime(), destination);
|
||||
trimPut(ContentModel.PROP_CREATOR, propSet.getPropertyValue("Author"), destination);
|
||||
// trimPut(ContentModel.PROP_MODIFIED,
|
||||
// si.getLastSaveDateTime(), destination);
|
||||
// trimPut(ContentModel.PROP_MODIFIER, si.getLastAuthor(),
|
||||
// destination);
|
||||
}
|
||||
finally
|
||||
{
|
||||
document.dispose();
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Throwable e)
|
||||
{
|
||||
throw new ContentIOException("Conversion failed: \n" +
|
||||
" source: " + sourceUrl + "\n",
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
public String toUrl(File file, MyUnoConnection connection) throws ConnectException
|
||||
{
|
||||
Object fcp = connection.getFileContentService();
|
||||
XFileIdentifierConverter fic = (XFileIdentifierConverter) UnoRuntime.queryInterface(
|
||||
XFileIdentifierConverter.class, fcp);
|
||||
return fic.getFileURLFromSystemPath("", file.getAbsolutePath());
|
||||
}
|
||||
|
||||
public double getReliability(String sourceMimetype)
|
||||
{
|
||||
if (isConnected())
|
||||
return super.getReliability(sourceMimetype);
|
||||
else
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
private static PropertyValue property(String name, Object value)
|
||||
{
|
||||
PropertyValue property = new PropertyValue();
|
||||
property.Name = name;
|
||||
property.Value = value;
|
||||
return property;
|
||||
}
|
||||
|
||||
static class MyUnoConnection extends UnoConnection
|
||||
{
|
||||
public MyUnoConnection(String url)
|
||||
{
|
||||
super(url);
|
||||
}
|
||||
|
||||
public Object getFileContentService() throws ConnectException
|
||||
{
|
||||
return getService("com.sun.star.ucb.FileContentProvider");
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Copyright (C) 2005 Jesper Steen Møller
|
||||
*
|
||||
* Licensed under the Mozilla Public License version 1.1
|
||||
* with a permitted attribution clause. You may obtain a
|
||||
* copy of the License at
|
||||
*
|
||||
* http://www.alfresco.org/legal/license.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific
|
||||
* language governing permissions and limitations under the
|
||||
* License.
|
||||
*/
|
||||
package org.alfresco.repo.content.metadata;
|
||||
|
||||
import org.alfresco.repo.content.MimetypeMap;
|
||||
|
||||
/**
|
||||
* @see org.alfresco.repo.content.transform.UnoMetadataExtracter
|
||||
* @author Jesper Steen M<>ller
|
||||
*/
|
||||
public class UnoMetadataExtracterTest extends AbstractMetadataExtracterTest
|
||||
{
|
||||
private UnoMetadataExtracter extracter;
|
||||
|
||||
public void onSetUpInTransaction() throws Exception
|
||||
{
|
||||
extracter = new UnoMetadataExtracter(mimetypeMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the same extracter regardless - it is allowed
|
||||
*/
|
||||
protected MetadataExtracter getExtracter()
|
||||
{
|
||||
return extracter;
|
||||
}
|
||||
|
||||
public void testReliability() throws Exception
|
||||
{
|
||||
if (!extracter.isConnected())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
double reliability = 0.0;
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_TEXT_PLAIN);
|
||||
assertEquals("Mimetype text should not be supported", 0.0, reliability);
|
||||
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT);
|
||||
assertEquals("OpenOffice 2.0 Writer (OpenDoc) should be supported", 1.0, reliability);
|
||||
|
||||
reliability = extracter.getReliability(MimetypeMap.MIMETYPE_OPENOFFICE_WRITER);
|
||||
assertEquals("OpenOffice 1.0 Writer should be supported", 1.0, reliability);
|
||||
}
|
||||
|
||||
public void testOOo20WriterExtraction() throws Exception
|
||||
{
|
||||
if (!extracter.isConnected())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
testCommonMetadata(extractFromExtension("odt", MimetypeMap.MIMETYPE_OPENDOCUMENT_TEXT));
|
||||
}
|
||||
|
||||
public void testOOo10WriterExtraction() throws Exception
|
||||
{
|
||||
if (!extracter.isConnected())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
testCommonMetadata(extractFromExtension("sxw", MimetypeMap.MIMETYPE_OPENOFFICE_WRITER));
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user