Package jease.cms.domain

Source Code of jease.cms.domain.Document

/*
    Copyright (C) 2011 maik.jablonski@jease.org

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
package jease.cms.domain;

import java.io.IOException;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

/**
* A File which allows to extract the content as plain text and is therefore
* indexable for full text search.
*/
public class Document extends File {

  private String text;
  private long lastTextUpdate;

  public Document() {
  }

  public StringBuilder getFulltext() {
    return super.getFulltext().append("\n").append(getText());
  }

  public String getText() {
    if (text == null || lastTextUpdate < getFile().lastModified()) {
      try {
        Tika tika = new Tika();
        tika.setMaxStringLength(-1);
        text = tika.parseToString(getFile()).replaceAll("\n\\s*\n+","\n\n");
      } catch (TikaException e) {
        text = "";
      } catch (IOException e) {
        text = "";
      } finally {
        lastTextUpdate = getFile().lastModified();
      }
    }
    return text;
  }

  public Document copy(boolean recursive) {
    Document document = (Document) super.copy(recursive);
    // Trigger conversion to plain text
    document.getText();
    return document;
  }
}
TOP

Related Classes of jease.cms.domain.Document

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.