// Source Code of cc.mallet.pipe.CharSequenceRemoveHTML$ParserGetter

/* Copyright (C) 2006 Univ. of Massachusetts Amherst, Computer Science Dept.
 This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
 http://www.cs.umass.edu/~mccallum/mallet 
 This software is provided under the terms of the Common Public License,
 version 1.0, as published by http://www.opensource.org.  For further
 information, see the file `LICENSE' included with this distribution. */


package cc.mallet.pipe;




import javax.swing.text.html.*;


import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;


import java.io.*;


/**
 * This pipe removes HTML from a CharSequence. The HTML is actually parsed here,
 * so we should have less HTML slipping through... but it is almost certainly
 * much slower than a regular expression, and could fail on broken HTML.
 * 
 * @author Greg Druck <a href="mailto:gdruck@cs.umass.edu">gdruck@cs.umass.edu</a>
 */


public class CharSequenceRemoveHTML extends Pipe {


  public Instance pipe(Instance carrier) {
    String text = ((CharSequence) carrier.getData()).toString();
    
    // I take these out ahead of time because the
    // Java HTML parser seems to die here.
    text = text.replaceAll("\\<NOFRAMES\\>","");
    text = text.replaceAll("\\<\\/NOFRAMES\\>","");
    
    ParserGetter kit = new ParserGetter();
    HTMLEditorKit.Parser parser = kit.getParser();
    HTMLEditorKit.ParserCallback callback = new TagStripper();


    try {
      StringReader r = new StringReader(text);
      parser.parse(r, callback, true);
    } catch (IOException e) {
      System.err.println(e);
    }
    String result = ((TagStripper) callback).getText();
    carrier.setData((CharSequence) result);
    return carrier;
  }


  private class TagStripper extends HTMLEditorKit.ParserCallback {
    private String text;


    public TagStripper() {
      text = "";
    }


    public void handleText(char[] txt, int position) {
      for (int index = 0; index < txt.length; index++) {
        text += txt[index];
      }
      text += "\n";
    }


    public String getText() {
      return text;
    }


  }


  private class ParserGetter extends HTMLEditorKit {
    // purely to make this method public
    public HTMLEditorKit.Parser getParser() {
      return super.getParser();
    }
  }


  public static void main(String[] args) {
    String htmldir = args[0];
    Pipe pipe = new SerialPipes(new Pipe[] { new Input2CharSequence(),
        new CharSequenceRemoveHTML() });
    InstanceList list = new InstanceList(pipe);
    list.addThruPipe(new FileIterator(htmldir, FileIterator.STARTING_DIRECTORIES));


    for (int index = 0; index < list.size(); index++) {
      Instance inst = list.get(index);
      System.err.println(inst.getData());
    }


  }


}
// Source Code of cc.mallet.pipe.CharSequenceRemoveHTML$ParserGetter

// Related Classes of cc.mallet.pipe.CharSequenceRemoveHTML$ParserGetter