Package games.stendhal.server.util

Source Code of games.stendhal.server.util.WikipediaAccess

/* $Id: WikipediaAccess.java,v 1.20 2010/11/28 17:00:45 martinfuchs Exp $ */
/***************************************************************************
*                   (C) Copyright 2003-2010 - Stendhal                    *
***************************************************************************
***************************************************************************
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
***************************************************************************/
package games.stendhal.server.util;

import games.stendhal.client.update.HttpClient;

import java.net.URLEncoder;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
* Gets the first text paragraph from the specified Wikipedia article using the
* MediaWiki bot API.
*
* You can invoke the parser either inline using the method parse() or start it
* in a new thread.
*
* TODO: handle redirects (but take care, there might be two redirects that
* point to each other).
*
* @author hendrik
*/
public class WikipediaAccess extends DefaultHandler implements Runnable {

  private final String title;

  private final StringBuilder text = new StringBuilder();

  /** used by the parser to detect the right tag. */
  private boolean isContent;

  /** was the parsing completed. */
  private boolean finished;

  private String error;

  /**
   * Creates a new WikipeidaAccess.
   *
   * @param title
   *            title of the page to access
   */
  public WikipediaAccess(final String title) {
    this.title = title;
  }

  @Override
  public void startElement(final String namespaceURI, final String lName, final String qName,
      final Attributes attrs) {
    isContent = qName.equals("rev");
  }

  @Override
  public void characters(final char[] ch, final int start, final int length) throws SAXException {
    if (isContent) {
      text.append(ch, start, length);
    }
  }

  /**
   * Returns the unparsed text.
   *
   * @return content
   */
  public String getText() {
    return text.toString();
  }

  /**
   * Gets the last error message.
   *
   * @return error message or <code>null</code> in case no error occurred
   */
  public String getError() {
    return error;
  }

  /**
   * Returns the first paragraph of the specified article without wiki code.
   *
   * @return content
   */
  public String getProcessedText() {
    String content = getText();

    if (content != null) {
      // remove REDIRECT headers
      if (content.startsWith("#REDIRECT")) {
        content = content.replaceFirst(".*\n", "");
      }

      content = wikiToPlainText(content);
    }

    return content;
  }

  /**
   * Extract plain text from Wikipedia article content.
   * @param content
   * @return
   */
  private static String wikiToPlainText(String content) {
    // remove image links
    content = content.replaceAll("\\[\\[[iI]mage:[^\\]]*\\]\\]", "");
    // remove comments
    // (?s) means that . should also match newlines (DOTALL mode).
    content = content.replaceAll("(?s)<!--.*?-->", "");
    // remove ref
    content = content.replaceAll("(?s)<ref>.*?</ref>", "");

    // remove templates
    // first for two level deep templates
    content = content.replaceAll("(?s)\\{\\{([^{}]*?\\{\\{[^{}]*?\\}\\})+[^{}].*?\\}\\}", "");
    // then handle one level templates (This doesn't work with templates inside templates.)
    content = content.replaceAll("(?s)\\{\\{.*?\\}\\}", "");

    // remove tables
    // This doesn't work with templates inside templates.
    content = content.replaceAll("(?s)\\{\\|.*?\\|\\}", "");

    // remove complex links
    content = content.replaceAll("\\[\\[[^\\]]*\\|", "");
    // remove simple links
    content = content.replaceAll("\\[\\[", "");
    content = content.replaceAll("\\]\\]", "");
    // remove tags
    content = content.replaceAll("(?s)<.*?>", "");

    // ignore leading empty lines and spaces
    content = content.trim();

    // extract the first paragraph (ignoring very short ones but opposing a max len)
    final int size = content.length();
    int endOfFirstParagraph = content.indexOf("\n", 50);
    if (endOfFirstParagraph < 0) {
      endOfFirstParagraph = size;
    }
    content = content.substring(0, Math.min(endOfFirstParagraph, 1024));

    return content;
  }

  /**
   * Starts the parsing of the specified article.
   *
   * @throws Exception
   *             in case of an unexpected error
   */
  private boolean parse() {
    String keyword = title;
    boolean success;

    try {
      while(keyword != null) {
        // look it up using the Wikipedia API
        final HttpClient httpClient = new HttpClient(
            "http://en.wikipedia.org/w/api.php?action=query&titles="
                + keyword.replace(' ', '_').replace("%", "%25")
                + "&prop=revisions&rvprop=content&format=xml");
        final SAXParserFactory factory = SAXParserFactory.newInstance();

        // Parse the input
        final SAXParser saxParser = factory.newSAXParser();
        saxParser.parse(httpClient.getInputStream(), this);

        final String response = getText();

        if (response.startsWith("#REDIRECT")) {
          // extract the new keyword
          final String redirect = wikiToPlainText(response).substring(9);

          // check for new line to detect if we got only a one liner to redirect
          if (redirect.indexOf('\n') > -1) {
            // We found the redirected article.
            keyword = null;
          } else {
            if (keyword.equalsIgnoreCase(redirect)) {
              // stop to avoid an infinite loop
              keyword = null;
            } else {
              reset();
              keyword = redirect;
            }
          }
        } else {
          // finished
          keyword = null;
        }
      }

      success = true;
    } catch (final Exception e) { // SAXException, IOException
      error = e.toString();
      success = false;
    } finally {
      finished = true;
    }

    return success;
  }

  /**
   * Reset internal state to repeat a query.
   */
  private void reset() {
    isContent = false;
    finished = false;
  }

  public void run() {
    parse();

    // ignore failures as they are already logged in the parse()-method itself
  }

  /**
   * Returns true when the XML response was completely parsed.
   *
   * @return true if the parsing was completed, false otherwise
   */
  public boolean isFinished() {
    return finished;
  }
}
TOP

Related Classes of games.stendhal.server.util.WikipediaAccess

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Corporation. Contact coftware#gmail.com.