Examples of org.apache.lucene.demo.html.HTMLParser$JJCalls

org.apache.lucene.demo.html.HTMLParser

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), false, true, false));


    HTMLParser parser = new HTMLParser(f);


    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(Field.Text("contents", parser.getReader()));


    // Add the summary as an UnIndexed field, so that it is stored and returned
    // with hit documents for display.
    doc.add(Field.UnIndexed("summary", parser.getSummary()));


    // Add the title as a separate Text field, so that it can be searched
    // separately.
    doc.add(Field.Text("title", parser.getTitle()));


    // return the document
    return doc;
  }

View Full Code Here

    // 5. skip until end of doc header
    read("</DOCHDR>",null,false,false); 
    // 6. collect until end of doc
    sb = read("</DOC>",null,false,true);
    // this is the next document, so parse it  
    HTMLParser p = new HTMLParser(new StringReader(sb.toString()));
    // title
    String title = p.getTitle();
    // properties 
    Properties props = p.getMetaTags(); 
    // body
    Reader r = p.getReader();
    char c[] = new char[1024];
    StringBuffer bodyBuf = new StringBuffer();
    int n;
    while ((n = r.read(c)) >= 0) {
      if (n>0) {

View Full Code Here

    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.UN_TOKENIZED));


    FileInputStream fis = new FileInputStream(f);
    HTMLParser parser = new HTMLParser(fis);
      
    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field("contents", parser.getReader()));


    // Add the summary as a field that is stored and returned with
    // hit documents for display.
    doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));


    // Add the title as a field so that it can be searched; it is also stored.
    doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));


    // return the document
    return doc;
  }

View Full Code Here

    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.UN_TOKENIZED));


    FileInputStream fis = new FileInputStream(f);
    HTMLParser parser = new HTMLParser(fis);
      
    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field("contents", parser.getReader()));


    // Add the summary as a field that is stored and returned with
    // hit documents for display.
    doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));


    // Add the title as a field so that it can be searched; it is also stored.
    doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));


    // return the document
    return doc;
  }

View Full Code Here

    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));


    FileInputStream fis = new FileInputStream(f);
    HTMLParser parser = new HTMLParser(fis);
      
    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field("contents", parser.getReader()));


    // Add the summary as a field that is stored and returned with
    // hit documents for display.
    doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));


    // Add the title as a field so that it can be searched; it is also stored.
    doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.ANALYZED));


    // return the document
    return doc;
  }

View Full Code Here

    // Add the uid as a field, so that the index can be incrementally maintained.
    // This field is not stored with the document; it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), false, true, false));


    HTMLParser parser = new HTMLParser(f);


    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(Field.Text("contents", parser.getReader()));


    // Add the summary as an UnIndexed field, so that it is stored and returned
    // with hit documents for display.
    doc.add(Field.UnIndexed("summary", parser.getSummary()));


    // Add the title as a separate Text field, so that it can be searched
    // separately.
    doc.add(Field.Text("title", parser.getTitle()));


    // return the document
    return doc;
  }

View Full Code Here

TOP

Related Classes of org.apache.lucene.demo.html.HTMLParser$JJCalls

org.apache.lucene.benchmark.byTask.feeds.TrecDocMaker

org.apache.lucene.demo.HTMLDocument

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.