Package org.apache.lenya.lucene.html

Examples of org.apache.lenya.lucene.html.HTMLParser


     * @return DOCUMENT ME!
     *
     * @throws Exception DOCUMENT ME!
     */
    public String reTokenize(File file) throws Exception {
        TokenStream ts = new StandardAnalyzer().tokenStream(new HTMLParser(file).getReader());

        Token token = null;

        while ((token = ts.next()) != null) {
            System.out.println("ReTokenizeFile.reTokenize(File): " + token.termText() + " " +
View Full Code Here


     * @return the content of the file.
     * @throws FileNotFoundException if the file does not exist.
     * @throws IOException if something else went wrong.
     */
    protected String readHtmlFile(File file) throws FileNotFoundException, IOException {
        java.io.Reader reader = new HTMLParser(file).getReader();
        char[] chars = new char[1024];
        int chars_read;
        java.io.Writer writer = new java.io.StringWriter();

        while ((chars_read = reader.read(chars)) > 0) {
View Full Code Here

        // This field is not stored with document, it is indexed, but it is not
        // tokenized prior to indexing.
        doc.add(new Field("uid", uid(f, htdocsDumpDir), false, true, false));

        //HtmlDocument htmlDoc = new HtmlDocument(f);
        HTMLParser parser = new HTMLParser(f);

        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        // Add the title as a separate Text field, so that it can be searched separately.
        /*
                String title = htmlDoc.getTitle();

                if (title != null) {
                    doc.add(Field.Text("title", title));
                } else {
                    doc.add(Field.Text("title", ""));
                }
        */
        doc.add(Field.Text("title", parser.getTitle()));

        //System.out.println("HTMLDocument.getLuceneDocument(): title field added: " + title);
        // Add the tag-stripped contents as a Reader-valued Text field so it will get tokenized and indexed.
        /*
                String body = htmlDoc.getBody();
                String contents = "";

                if ((body != null) && (title != null)) {
                    contents = title + " " + body;
                    doc.add(Field.Text("contents", title + body));
                }

                doc.add(Field.Text("contents", contents));
        */
        doc.add(Field.Text("contents", parser.getReader()));

        return doc;
    }
View Full Code Here

     * @param file The file to retokenize
     * @return The path to the retokenized file
     * @throws Exception if an error occurs
     */
    public String reTokenize(File file) throws Exception {
        TokenStream ts = new StandardAnalyzer().tokenStream("",new HTMLParser(file).getReader());

        Token token = null;

        while ((token = ts.next()) != null) {
            System.out.println("ReTokenizeFile.reTokenize(File): " + token.termText() + " " +
View Full Code Here

     * @return the content of the file.
     * @throws FileNotFoundException if the file does not exist.
     * @throws IOException if something else went wrong.
     */
    protected String readHtmlFile(File file) throws FileNotFoundException, IOException {
        java.io.Reader reader = new HTMLParser(file).getReader();
        char[] chars = new char[1024];
        int chars_read;
        java.io.Writer writer = new java.io.StringWriter();

        while ((chars_read = reader.read(chars)) > 0) {
View Full Code Here

        // This field is not stored with document, it is indexed, but it is not
        // tokenized prior to indexing.
        doc.add(new Field("uid", uid(f, htdocsDumpDir), false, true, false));

        //HtmlDocument htmlDoc = new HtmlDocument(f);
        HTMLParser parser = new HTMLParser(f);

        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        // Add the title as a separate Text field, so that it can be searched separately.
        /*
                String title = htmlDoc.getTitle();

                if (title != null) {
                    doc.add(Field.Text("title", title));
                } else {
                    doc.add(Field.Text("title", ""));
                }
        */
        doc.add(Field.Text("title", parser.getTitle()));

        //System.out.println("HTMLDocument.getLuceneDocument(): title field added: " + title);
        // Add the tag-stripped contents as a Reader-valued Text field so it will get tokenized and indexed.
        /*
                String body = htmlDoc.getBody();
                String contents = "";

                if ((body != null) && (title != null)) {
                    contents = title + " " + body;
                    doc.add(Field.Text("contents", title + body));
                }

                doc.add(Field.Text("contents", contents));
        */
        doc.add(Field.Text("contents", parser.getReader()));

        return doc;
    }
View Full Code Here

     * @return DOCUMENT ME!
     *
     * @throws Exception DOCUMENT ME!
     */
    public String reTokenize(File file) throws Exception {
        TokenStream ts = new StandardAnalyzer().tokenStream(new HTMLParser(file).getReader());

        Token token = null;

        while ((token = ts.next()) != null) {
            System.out.println("ReTokenizeFile.reTokenize(File): " + token.termText() + " " +
View Full Code Here

        throws FileNotFoundException, IOException {
        if (file.getName().substring(file.getName().length() - 4).equals(".pdf")) {
            file = new File(file.getAbsolutePath() + ".txt");
        }

        java.io.Reader reader = new HTMLParser(file).getReader();
        char[] chars = new char[1024];
        int chars_read;
        java.io.Writer writer = new java.io.StringWriter();

        while ((chars_read = reader.read(chars)) > 0) {
View Full Code Here

        // This field is not stored with document, it is indexed, but it is not
        // tokenized prior to indexing.
        doc.add(new Field("uid", uid(f, htdocsDumpDir), false, true, false));

        //HtmlDocument htmlDoc = new HtmlDocument(f);
        HTMLParser parser = new HTMLParser(f);

        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        // Add the title as a separate Text field, so that it can be searched separately.
        /*
                String title = htmlDoc.getTitle();

                if (title != null) {
                    doc.add(Field.Text("title", title));
                } else {
                    doc.add(Field.Text("title", ""));
                }
        */
        doc.add(Field.Text("title", parser.getTitle()));

        //System.out.println("HTMLDocument.getLuceneDocument(): title field added: " + title);
        // Add the tag-stripped contents as a Reader-valued Text field so it will get tokenized and indexed.
        /*
                String body = htmlDoc.getBody();
                String contents = "";

                if ((body != null) && (title != null)) {
                    contents = title + " " + body;
                    doc.add(Field.Text("contents", title + body));
                }

                doc.add(Field.Text("contents", contents));
        */
        doc.add(Field.Text("contents", parser.getReader()));

        //System.out.println("HTMLDocument.getLuceneDocument(): contents field added: " + contents);
        return doc;
    }
View Full Code Here

     * @return DOCUMENT ME!
     *
     * @throws Exception DOCUMENT ME!
     */
    public String reTokenize(File file) throws Exception {
        TokenStream ts = new StandardAnalyzer().tokenStream(new HTMLParser(file).getReader());

        Token token = null;

        while ((token = ts.next()) != null) {
            System.out.println("ReTokenizeFile.reTokenize(File): " + token.termText() + " " +
View Full Code Here

TOP

Related Classes of org.apache.lenya.lucene.html.HTMLParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.