Package org.jasen.net

Source Code of org.jasen.net.URLTokenizer

/*
* @(#)URLTokenizer.java  31/10/2004
*
* Copyright (c) 2004, 2005  jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*   1. Redistributions of source code must retain the above copyright notice,
*      this list of conditions and the following disclaimer.
*
*   2. Redistributions in binary form must reproduce the above copyright
*      notice, this list of conditions and the following disclaimer in
*      the documentation and/or other materials provided with the distribution.
*
*   3. The names of the authors may not be used to endorse or promote products
*      derived from this software without specific prior written permission.
*
*   4. Any modification or additions to the software must be contributed back
*      to the project.
*
*   5. Any investigation or reverse engineering of source code or binary to
*      enable emails to bypass the filters, and hence inflict spam and or viruses
*      onto users who use or do not use jASEN could subject the perpetrator to
*      criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.net;

import java.net.URL;

import org.jasen.core.parsers.StandardHTMLParser;
import org.jasen.error.JasenException;
import org.jasen.interfaces.URLReader;

/**
* <p>
* Tokenizes the content read from a url.
* </p>
*
* @author Jason Polites
*/
public class URLTokenizer {

  private URLReader urlReader;

  public String[] tokenize(URL url) throws JasenException {
    assertReaderAvailable();

    String html = urlReader.readURL(url);
    StandardHTMLParser htmlParser = new StandardHTMLParser();
    String text = htmlParser.extractText(html);
    String[] tokens = text.split(" ");
    return tokens;
  }

  /**
   * @return Returns the urlReader.
   */
  public URLReader getUrlReader() {
    return urlReader;
  }
  /**
   * @param urlReader The urlReader to set.
   */
  public void setUrlReader(URLReader urlReader) {
    this.urlReader = urlReader;
  }

  private void assertReaderAvailable() {
    if(urlReader == null) urlReader = new StandardURLReader();
  }
}
TOP

Related Classes of org.jasen.net.URLTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.