package com.google.code.ftspc.lector.parsers.HTML;
import com.google.code.ftspc.lector.indexers.AddDataToIndex;
import com.google.code.ftspc.lector.indexers.CommonFunctions;
import com.google.code.ftspc.lector.ini_and_vars.Vars;
import com.google.code.ftspc.lector.parsers.Parser;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import net.htmlparser.jericho.Source;
/**
 * Parser for HTML files: reads a file using its detected encoding, strips the
 * markup with Jericho's text extractor and hands the plain text to the indexer.
 *
 * <p>An instance handles one file. {@link #start_th(String, String)} stores the
 * target and calls the inherited {@code start()} (presumably {@code Thread.start()}
 * via {@code CommonFunctions} - confirm), after which {@link #run()} does the work.
 *
 * @author Arthur Khusnutdinov
 */
public class HTMLParser extends CommonFunctions implements Parser {

    /** Size, in chars, of the chunk buffer used while reading the file. */
    private static final int READ_BUFFER_CHARS = 8192;

    private String pathToFile;
    private String fileName;

    /**
     * Parses the file set by {@link #start_th(String, String)} and adds its
     * text content to the index.
     *
     * <p>Empty files (zero bytes, or zero characters after decoding) are skipped.
     * Any exception is logged through {@code Vars.logger} rather than propagated.
     * {@code Vars.current_run_indexes} is decremented exactly once on every exit
     * path, including when an {@code Error} escapes.
     */
    @Override
    public void run() {
        try {
            String fileEnc = this.detectEncoding(pathToFile);
            File fileForParsing = new File(pathToFile);

            // Compare as long: casting the length to int wraps for files > 2 GiB.
            if (fileForParsing.length() != 0) {
                AddDataToIndex indexer = new AddDataToIndex(null);
                String rawHtml = readDecodedContent(fileForParsing, fileEnc);

                // A non-empty file can still decode to nothing (e.g. only a
                // byte-order mark); treat it like an empty file.
                if (rawHtml.length() != 0) {
                    // Once decoded, a Java String is already Unicode, so no
                    // further charset conversion is needed before parsing.
                    Source source = new Source(rawHtml);
                    // Per the Jericho API docs, a null logger disables logging
                    // for this source.
                    source.setLogger(null);
                    String plainText = source.getTextExtractor().toString();
                    indexer.doAddData(plainText, pathToFile, fileName);
                }
            }
        } catch (Exception ex) {
            Vars.logger.fatal("Error: ", ex);
        } finally {
            // NOTE(review): this looks like a shared counter of running parser
            // jobs; the decrement is not atomic - confirm how Vars guards it.
            Vars.current_run_indexes--;
        }
    }

    /**
     * Stores the file to parse and starts processing.
     *
     * @param pathToFile path of the HTML file to parse
     * @param fileName   file name passed on to the indexer together with the path
     */
    @Override
    public void start_th(String pathToFile, String fileName) {
        this.pathToFile = pathToFile;
        this.fileName = fileName;
        this.start();
    }

    /**
     * Reads the whole file and decodes it with the given charset.
     *
     * <p>Reads in a loop because a single {@code Reader.read(char[])} call may
     * return fewer characters than are available. The underlying stream is
     * closed on every path, including when the charset name is unsupported.
     *
     * @param file     file to read
     * @param encoding charset name used to decode the bytes
     * @return the decoded content; empty if no characters could be decoded
     * @throws IOException if the file cannot be opened or read, or if the
     *                     charset name is not supported
     */
    private String readDecodedContent(File file, String encoding) throws IOException {
        FileInputStream in = new FileInputStream(file);
        try {
            InputStreamReader reader = new InputStreamReader(in, encoding);
            StringBuilder content = new StringBuilder();
            char[] buffer = new char[READ_BUFFER_CHARS];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
            return content.toString();
        } finally {
            // Closing the byte stream releases the only native resource held.
            in.close();
        }
    }
}