/* Copyright (C) 2006 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.pipe;
import javax.swing.text.html.*;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import java.io.*;
/**
 * This pipe removes HTML from a CharSequence. The HTML is actually parsed here,
 * so less HTML should slip through than with a regular expression, but parsing
 * is almost certainly much slower and could fail on broken HTML.
 *
 * @author Greg Druck <a href="mailto:gdruck@cs.umass.edu">gdruck@cs.umass.edu</a>
 */
public class CharSequenceRemoveHTML extends Pipe {
public Instance pipe(Instance carrier) {
String text = ((CharSequence) carrier.getData()).toString();
// I take these out ahead of time because the
// Java HTML parser seems to die here.
text = text.replaceAll("\\<NOFRAMES\\>","");
text = text.replaceAll("\\<\\/NOFRAMES\\>","");
ParserGetter kit = new ParserGetter();
HTMLEditorKit.Parser parser = kit.getParser();
HTMLEditorKit.ParserCallback callback = new TagStripper();
try {
StringReader r = new StringReader(text);
parser.parse(r, callback, true);
} catch (IOException e) {
String result = ((TagStripper) callback).getText();
carrier.setData((CharSequence) result);
return carrier;
private class TagStripper extends HTMLEditorKit.ParserCallback {
private String text;
public TagStripper() {
text = "";
public void handleText(char[] txt, int position) {
for (int index = 0; index < txt.length; index++) {
text += txt[index];
text += "\n";
public String getText() {
return text;
private class ParserGetter extends HTMLEditorKit {
// purely to make this method public
public HTMLEditorKit.Parser getParser() {
return super.getParser();
public static void main(String[] args) {
String htmldir = args[0];
Pipe pipe = new SerialPipes(new Pipe[] { new Input2CharSequence(),
new CharSequenceRemoveHTML() });
InstanceList list = new InstanceList(pipe);
list.addThruPipe(new FileIterator(htmldir, FileIterator.STARTING_DIRECTORIES));
for (int index = 0; index < list.size(); index++) {
Instance inst = list.get(index);