Source Code of games.stendhal.server.util.WikipediaAccess

/* $Id: WikipediaAccess.java,v 1.20 2010/11/28 17:00:45 martinfuchs Exp $ */
/***************************************************************************
 *                   (C) Copyright 2003-2010 - Stendhal                    *
 ***************************************************************************
 ***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/
package games.stendhal.server.util;


import games.stendhal.client.update.HttpClient;


import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;


import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;


/**
 * Gets the first text paragraph from the specified Wikipedia article using the
 * MediaWiki bot API.
 * 
 * You can invoke the parser either inline using the method parse() or start it
 * in a new thread.
 * 
 * TODO: handle redirects (but take care, there might be two redirects that
 * point to each other).
 * 
 * @author hendrik
 */
public class WikipediaAccess extends DefaultHandler implements Runnable {


  private final String title;


  private final StringBuilder text = new StringBuilder();


  /** used by the parser to detect the right tag. */
  private boolean isContent;


  /** was the parsing completed. */
  private boolean finished;


  private String error;


  /**
   * Creates a new WikipeidaAccess.
   * 
   * @param title
   *            title of the page to access
   */
  public WikipediaAccess(final String title) {
    this.title = title;
  }


  @Override
  public void startElement(final String namespaceURI, final String lName, final String qName,
      final Attributes attrs) {
    isContent = qName.equals("rev");
  }


  @Override
  public void characters(final char[] ch, final int start, final int length) throws SAXException {
    if (isContent) {
      text.append(ch, start, length);
    }
  }


  /**
   * Returns the unparsed text.
   * 
   * @return content
   */
  public String getText() {
    return text.toString();
  }


  /**
   * Gets the last error message.
   * 
   * @return error message or <code>null</code> in case no error occurred
   */
  public String getError() {
    return error;
  }


  /**
   * Returns the first paragraph of the specified article without wiki code.
   * 
   * @return content
   */
  public String getProcessedText() {
    String content = getText();


    if (content != null) {
      // remove REDIRECT headers
      if (content.startsWith("#REDIRECT")) {
        content = content.replaceFirst(".*\n", "");
      }


      content = wikiToPlainText(content);
    }


    return content;
  }


  /**
   * Extract plain text from Wikipedia article content.
   * @param content
   * @return
   */
  private static String wikiToPlainText(String content) {
    // remove image links
    content = content.replaceAll("\\[\\[[iI]mage:[^\\]]*\\]\\]", "");
    // remove comments
    // (?s) means that . should also match newlines (DOTALL mode).
    content = content.replaceAll("(?s)<!--.*?-->", "");
    // remove ref
    content = content.replaceAll("(?s)<ref>.*?</ref>", "");


    // remove templates
    // first for two level deep templates
    content = content.replaceAll("(?s)\\{\\{([^{}]*?\\{\\{[^{}]*?\\}\\})+[^{}].*?\\}\\}", "");
    // then handle one level templates (This doesn't work with templates inside templates.)
    content = content.replaceAll("(?s)\\{\\{.*?\\}\\}", "");


    // remove tables
    // This doesn't work with templates inside templates.
    content = content.replaceAll("(?s)\\{\\|.*?\\|\\}", "");


    // remove complex links
    content = content.replaceAll("\\[\\[[^\\]]*\\|", "");
    // remove simple links
    content = content.replaceAll("\\[\\[", "");
    content = content.replaceAll("\\]\\]", "");
    // remove tags
    content = content.replaceAll("(?s)<.*?>", "");


    // ignore leading empty lines and spaces
    content = content.trim();


    // extract the first paragraph (ignoring very short ones but opposing a max len)
    final int size = content.length();
    int endOfFirstParagraph = content.indexOf("\n", 50);
    if (endOfFirstParagraph < 0) {
      endOfFirstParagraph = size;
    }
    content = content.substring(0, Math.min(endOfFirstParagraph, 1024));


    return content;
  }


  /**
   * Starts the parsing of the specified article.
   * 
   * @throws Exception
   *             in case of an unexpected error
   */
  private boolean parse() {
    String keyword = title;
    boolean success;


    try {
      while(keyword != null) {
        // look it up using the Wikipedia API
        final HttpClient httpClient = new HttpClient(
            "http://en.wikipedia.org/w/api.php?action=query&titles="
                + keyword.replace(' ', '_').replace("%", "%25")
                + "&prop=revisions&rvprop=content&format=xml");
        final SAXParserFactory factory = SAXParserFactory.newInstance();


        // Parse the input
        final SAXParser saxParser = factory.newSAXParser();
        saxParser.parse(httpClient.getInputStream(), this);


        final String response = getText();


        if (response.startsWith("#REDIRECT")) {
          // extract the new keyword
          final String redirect = wikiToPlainText(response).substring(9);


          // check for new line to detect if we got only a one liner to redirect
          if (redirect.indexOf('\n') > -1) {
            // We found the redirected article.
            keyword = null;
          } else {
            if (keyword.equalsIgnoreCase(redirect)) {
              // stop to avoid an infinite loop
              keyword = null;
            } else {
              reset();
              keyword = redirect;
            }
          }
        } else {
          // finished
          keyword = null;
        }
      }


      success = true;
    } catch (final Exception e) { // SAXException, IOException
      error = e.toString();
      success = false;
    } finally {
      finished = true;
    }


    return success;
  }


  /**
   * Reset internal state to repeat a query.
   */
  private void reset() {
    isContent = false;
    finished = false;
  }


  public void run() {
    parse();


    // ignore failures as they are already logged in the parse()-method itself
  }


  /**
   * Returns true when the XML response was completely parsed.
   * 
   * @return true if the parsing was completed, false otherwise
   */
  public boolean isFinished() {
    return finished;
  }
}
Source Code of games.stendhal.server.util.WikipediaAccess

Related Classes of games.stendhal.server.util.WikipediaAccess