Package railo.runtime.search.lucene2.docs

Source Code of railo.runtime.search.lucene2.docs.HTMLDocument

package railo.runtime.search.lucene2.docs;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;

import railo.commons.io.IOUtil;
import railo.commons.io.res.Resource;
import railo.commons.lang.StringUtil;
import railo.runtime.op.Caster;
import railo.runtime.search.lucene2.html.HTMLParser;

/** A utility for making Lucene Documents for HTML documents. */

public final class HTMLDocument {
    private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);
 
  public static String uid(Resource f) {
    return f.getPath().replace(FILE_SEPARATOR, '\u0000') +
      "\u0000" +
      DateField.timeToString(f.lastModified());
  }

  public static String uid2url(String uid) {
    String url = uid.replace('\u0000', '/');    // replace nulls with slashes
    return url.substring(0, url.lastIndexOf('/')); // remove date from end
  }
 
  public static Document getDocument(Resource res,String charset)  {
    Document doc = new Document();
    doc.add(FieldUtil.Text("uid", uid(res), false));
   
    HTMLParser parser = new HTMLParser();
    try {
      parser.parse(res,charset);
    }
    catch (Throwable t) {
        return doc;
    }
    addContent(doc,parser);
    return doc;
  }

  public static Document getDocument(StringBuffer content, Reader reader) {
      Document doc = new Document();
     
      HTMLParser parser = new HTMLParser();
      try {
        String str = IOUtil.toString(reader);
        if(content!=null)content.append(str);
        doc.add(FieldUtil.UnIndexed("size", Caster.toString(str.length())));
        StringReader sr = new StringReader(str);
          parser.parse(sr);
      }
      catch (Throwable t) {
        //t.printStackTrace();
          return doc;
      }
     
      addContent(doc, parser);
      return doc;
  }
 
    private static void addContent(Document doc, HTMLParser parser) {
     
      FieldUtil.setMimeType(doc,"text/html");
      //doc.add(FieldUtil.UnIndexed("mime-type", "text/html"));
     
      String content = parser.getContent();

      FieldUtil.setTitle(doc,parser.getTitle());
     
      String summary = parser.getSummary();
      if(StringUtil.isEmpty(summary)){
        summary=(content.length()<=200)? content:content.substring(0,200);
        FieldUtil.setSummary(doc,summary,false);
      }
      else{
        FieldUtil.setSummary(doc,summary,true);
      }
      FieldUtil.setRaw(doc,content);
      FieldUtil.setContent(doc,content);
     
      //doc.add(FieldUtil.UnIndexed("charset", StringUtil.valueOf(parser.getCharset())));
     
      if(parser.hasKeywords()) {
        FieldUtil.setKeywords(doc,parser.getKeywords());
      }
     

      if(parser.hasAuthor()){
        FieldUtil.setAuthor(doc,parser.getAuthor());
      }
      if(parser.hasCustom1()){
        FieldUtil.setCustom(doc,parser.getCustom1(),1);
      }
      if(parser.hasCustom2()){
        FieldUtil.setCustom(doc,parser.getCustom2(),2);
      }
      if(parser.hasCustom3()){
        FieldUtil.setCustom(doc,parser.getCustom3(),3);
      }
      if(parser.hasCustom4()){
        FieldUtil.setCustom(doc,parser.getCustom4(),4);
      }
     
     
   
}

  private HTMLDocument() {}
}
   
TOP

Related Classes of railo.runtime.search.lucene2.docs.HTMLDocument

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.