/* Copyright (C) 2006 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.pipe;
import javax.swing.text.html.*;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import java.io.*;
/**
* This pipe removes HTML from a CharSequence. The HTML is actually parsed here,
* so we should have less HTML slipping through... but it is almost certainly
* much slower than a regular expression, and could fail on broken HTML.
*
* @author Greg Druck <a href="mailto:gdruck@cs.umass.edu">gdruck@cs.umass.edu</a>
*/
public class CharSequenceRemoveHTML extends Pipe {
public Instance pipe(Instance carrier) {
String text = ((CharSequence) carrier.getData()).toString();
// I take these out ahead of time because the
// Java HTML parser seems to die here.
text = text.replaceAll("\\<NOFRAMES\\>","");
text = text.replaceAll("\\<\\/NOFRAMES\\>","");
ParserGetter kit = new ParserGetter();
HTMLEditorKit.Parser parser = kit.getParser();
HTMLEditorKit.ParserCallback callback = new TagStripper();
try {
StringReader r = new StringReader(text);
parser.parse(r, callback, true);
} catch (IOException e) {
System.err.println(e);
}
String result = ((TagStripper) callback).getText();
carrier.setData((CharSequence) result);
return carrier;
}
private class TagStripper extends HTMLEditorKit.ParserCallback {
private String text;
public TagStripper() {
text = "";
}
public void handleText(char[] txt, int position) {
for (int index = 0; index < txt.length; index++) {
text += txt[index];
}
text += "\n";
}
public String getText() {
return text;
}
}
private class ParserGetter extends HTMLEditorKit {
// purely to make this method public
public HTMLEditorKit.Parser getParser() {
return super.getParser();
}
}
public static void main(String[] args) {
String htmldir = args[0];
Pipe pipe = new SerialPipes(new Pipe[] { new Input2CharSequence(),
new CharSequenceRemoveHTML() });
InstanceList list = new InstanceList(pipe);
list.addThruPipe(new FileIterator(htmldir, FileIterator.STARTING_DIRECTORIES));
for (int index = 0; index < list.size(); index++) {
Instance inst = list.get(index);
System.err.println(inst.getData());
}
}
}