/*
Copyright (C) 2011 maik.jablonski@jease.org
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package jease.cms.domain;
import java.io.IOException;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
/**
 * A File whose content can be extracted as plain text, which makes it
 * indexable for full text search.
 */
public class Document extends File {

    /** Cached plain text extracted from the file; null until first extraction. */
    private String text;

    /** Modification time of the file that the cached text was extracted from. */
    private long lastTextUpdate;

    public Document() {
    }

    /**
     * Returns the fulltext of the parent File, followed by a newline and the
     * extracted plain text of this document.
     */
    public StringBuilder getFulltext() {
        return super.getFulltext().append("\n").append(getText());
    }

    /**
     * Returns the content of the file as plain text.
     *
     * The text is extracted via Tika on first access and again whenever the
     * modification time of the file is newer than the one recorded for the
     * cached text. If Tika cannot parse or read the file, the text is the
     * empty string.
     */
    public String getText() {
        // Read the timestamp once, BEFORE parsing: if the file is modified
        // while Tika is still working, the recorded time stays older than the
        // new modification time and the next call extracts the text again.
        long lastModified = getFile().lastModified();
        if (text == null || lastTextUpdate < lastModified) {
            try {
                Tika tika = new Tika();
                // -1 disables Tika's limit on the length of the returned string.
                tika.setMaxStringLength(-1);
                // Collapse runs of blank (or whitespace-only) lines into a
                // single empty line.
                text = tika.parseToString(getFile()).replaceAll("\n\\s*\n+", "\n\n");
            } catch (TikaException e) {
                // Best effort: an unparsable document simply has no text.
                text = "";
            } catch (IOException e) {
                // Best effort: an unreadable document simply has no text.
                text = "";
            } finally {
                lastTextUpdate = lastModified;
            }
        }
        return text;
    }

    /**
     * Creates a copy of this document and eagerly triggers the conversion to
     * plain text on the copy.
     */
    public Document copy(boolean recursive) {
        Document document = (Document) super.copy(recursive);
        // Trigger conversion to plain text
        document.getText();
        return document;
    }
}