Package com.google.code.ftspc.lector.parsers.HTML

Source Code of com.google.code.ftspc.lector.parsers.HTML.HTMLParser

package com.google.code.ftspc.lector.parsers.HTML;

import com.google.code.ftspc.lector.indexers.AddDataToIndex;
import com.google.code.ftspc.lector.indexers.CommonFunctions;
import com.google.code.ftspc.lector.ini_and_vars.Vars;
import com.google.code.ftspc.lector.parsers.Parser;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import net.htmlparser.jericho.Source;

/**
* Class for the HTML parser
* @author Arthur Khusnutdinov
*/
public class HTMLParser extends CommonFunctions implements Parser {

    private String pathToFile;
    private String fileName;

    @Override
    public void run() {
        try {
            String fileContent = "";
            File fileForParsing;
            int length;

            String fileEnc = this.detectEncoding(pathToFile);

            fileForParsing = new File(pathToFile);
            length = (int) fileForParsing.length();

            if (length != 0) {
                Source source;
                AddDataToIndex AddDataToIndex = new AddDataToIndex(null);
                char[] cbuf = new char[length];
                InputStreamReader isr = new InputStreamReader(
                        new FileInputStream(fileForParsing), fileEnc);
                final int read = isr.read(cbuf);

                fileContent = new String(cbuf, 0, read);
                isr.close();

                if (!fileEnc.equals("UTF-8")) {
                    fileContent = (new String(fileContent.getBytes("UTF-8"), "UTF-8"));
                }

                source = new Source(fileContent);
                source.setLogger(null);
                fileContent = source.getTextExtractor().toString();

                AddDataToIndex.doAddData(fileContent, pathToFile, fileName);

                pathToFile = null;
                AddDataToIndex = null;
                fileContent = null;
                source = null;
                isr = null;
                fileForParsing = null;
            } else {
                pathToFile = null;
                fileContent = null;
                fileForParsing = null;
            }
            Vars.current_run_indexes--;

        } catch (Exception ex) {
            Vars.current_run_indexes--;
            Vars.logger.fatal("Error: ", ex);
        }
    }

    @Override
    public void start_th(String pathToFile, String fileName) {
        this.pathToFile = pathToFile;
        this.fileName = fileName;
        this.start();
    }
}
TOP

Related Classes of com.google.code.ftspc.lector.parsers.HTML.HTMLParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle. Contact coftware#gmail.com.