Package org.apache.lucene.analysis.wikipedia

Examples of org.apache.lucene.analysis.wikipedia.WikipediaTokenizer


import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

/**
 * Factory for {@link WikipediaTokenizer}.
 * @version $Id$
 */
public class WikipediaTokenizerFactory extends BaseTokenizerFactory {
  // TODO: add support for WikipediaTokenizer's advanced options.
  public Tokenizer create(Reader input) {
    return new WikipediaTokenizer(input);
  }
}
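The factory simply wraps the tokenizer's single-argument constructor; Solr calls create(Reader) once per field value. As a minimal consumption sketch (not taken from the sources above; the demo class name and sample markup are illustrative), this is how the resulting stream is typically read, with each token labeled by its type:

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

public class WikipediaTokenizerDemo {
  public static void main(String[] args) throws Exception {
    // Sample markup is illustrative only.
    Tokenizer tokenizer = new WikipediaTokenizer(
        new StringReader("Visit [[Category:demo]] or [http://lucene.apache.org here]"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    tokenizer.reset();                        // required before the first incrementToken()
    while (tokenizer.incrementToken()) {
      System.out.println(term + " [" + type.type() + "]");
    }
    tokenizer.end();
    tokenizer.close();
  }
}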


public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
  protected static final String LINK_PHRASES = "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";

  public void testSimple() throws Exception {
    String text = "This is a [[Category:foo]]";
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(text));
    assertTokenStreamContents(tf,
        new String[] { "This", "is", "a", "foo" },
        new int[] { 0, 5, 8, 21 },
        new int[] { 4, 7, 9, 24 },
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", CATEGORY },
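For reference, the assertTokenStreamContents overload used here checks, in order: expected terms, start offsets, end offsets, and token types. A hand-rolled equivalent for just the first token would look like the sketch below (same expected values as the test above; the attribute plumbing is the standard Lucene pattern, not part of the original test):

    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader("This is a [[Category:foo]]"));
    CharTermAttribute term = tf.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tf.addAttribute(OffsetAttribute.class);
    TypeAttribute type = tf.addAttribute(TypeAttribute.class);
    tf.reset();
    assertTrue(tf.incrementToken());
    assertEquals("This", term.toString());     // first expected term
    assertEquals(0, offset.startOffset());     // first start offset
    assertEquals(4, offset.endOffset());       // first end offset
    assertEquals("<ALPHANUM>", type.type());   // first token type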

        + "''[[Category:ital_cat]]''  here is some that is ''italics [[Category:foo]] but is never closed."
        + "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this"
        + " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
        + " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
   
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
    assertTokenStreamContents(tf,
      new String[] {"link", "This", "is", "a",
        "foo", "Category", "This", "is", "a", "linked", "bar", "none",
        "withstanding", "Category", "This", "is", "parens", "This", "is", "a",
        "link", "This", "is", "an", "external", "URL",

        EXTERNAL_LINK_URL, EXTERNAL_LINK, EXTERNAL_LINK, CITATION,
        "<ALPHANUM>", "<ALPHANUM>"});
  }

  public void testLinkPhrases() throws Exception {
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES));
    checkLinkPhrases(tf);
  }

        new int[] { 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1 });
  }

  public void testLinks() throws Exception {
    String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
    assertTokenStreamContents(tf,
        new String[] { "http://lucene.apache.org/java/docs/index.html#news", "here",
          "http://lucene.apache.org/java/docs/index.html?b=c", "here",
          "https://lucene.apache.org/java/docs/index.html?b=c", "here" },
        new String[] { EXTERNAL_LINK_URL, EXTERNAL_LINK,
          EXTERNAL_LINK_URL, EXTERNAL_LINK,
          EXTERNAL_LINK_URL, EXTERNAL_LINK });
  }

  public void testLucene1133() throws Exception {
    Set<String> untoks = new HashSet<String>();
    untoks.add(WikipediaTokenizer.CATEGORY);
    untoks.add(WikipediaTokenizer.ITALICS);
    //should be exactly the same, regardless of untoks
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES), WikipediaTokenizer.TOKENS_ONLY, untoks);
    checkLinkPhrases(tf);
    String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
    tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
    assertTokenStreamContents(tf,
        new String[] { "a b c d", "e f g", "link", "here", "link",
          "there", "italics here", "something", "more italics", "h   i   j" },
        new int[] { 11, 32, 42, 47, 56, 61, 71, 86, 98, 124 },
        new int[] { 18, 37, 46, 51, 60, 66, 83, 95, 110, 133 },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }
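The constructor arguments above select the tokenizer's output mode: TOKENS_ONLY emits only the individual words, UNTOKENIZED_ONLY emits each marked-up phrase as a single token, and BOTH emits the phrase followed by its parts. A small sketch of the difference (input string chosen for illustration; the constants are real WikipediaTokenizer fields):

    Set<String> untoks = new HashSet<String>();
    untoks.add(WikipediaTokenizer.CATEGORY);
    untoks.add(WikipediaTokenizer.ITALICS);
    int[] modes = { WikipediaTokenizer.TOKENS_ONLY,
                    WikipediaTokenizer.UNTOKENIZED_ONLY,
                    WikipediaTokenizer.BOTH };
    for (int mode : modes) {
      Tokenizer t = new WikipediaTokenizer(new StringReader("''italics here''"), mode, untoks);
      CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
      t.reset();
      while (t.incrementToken()) {
        System.out.print("'" + term + "' ");
      }
      t.end();
      t.close();
      System.out.println();
    }
    // prints: 'italics' 'here'                      (TOKENS_ONLY)
    //         'italics here'                        (UNTOKENIZED_ONLY)
    //         'italics here' 'italics' 'here'       (BOTH)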

  public void testBoth() throws Exception {
    Set<String> untoks = new HashSet<String>();
    untoks.add(WikipediaTokenizer.CATEGORY);
    untoks.add(WikipediaTokenizer.ITALICS);
    String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
    //should output all the individual tokens plus the untokenized tokens as well; untokenized tokens carry UNTOKENIZED_TOKEN_FLAG
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
    assertTokenStreamContents(tf,
        new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g",
          "link", "here", "link", "there", "italics here", "italics", "here",
          "something", "more italics", "more", "italics", "h   i   j", "h", "i", "j" },
        new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132 },
        new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
        new int[] { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 }
       );
   
    // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
    tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
    int expectedFlags[] = new int[] { UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0,
        0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
    FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
    tf.reset();
    for (int i = 0; i < expectedFlags.length; i++) {
      assertTrue(tf.incrementToken());
      assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
    }
    assertFalse(tf.incrementToken());
    tf.close();
  }
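In BOTH mode the composite phrase tokens carry WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG while plain word tokens carry 0, which is exactly what the FlagsAttribute loop above asserts.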

  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WikipediaTokenizer(reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
  }

  /** blast some random large strings through the analyzer */
  public void testRandomHugeStrings() throws Exception {
    Random random = random();
    Analyzer a = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WikipediaTokenizer(reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
  }
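Both random tests delegate to BaseTokenStreamTestCase.checkRandomData, which pushes randomly generated strings through the analyzer to catch offset, reuse, and exception bugs; the extra 8192 argument in the second test raises the maximum length of the generated words.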

    super(Version.LUCENE_31, stopSet);
  }
 
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new WikipediaTokenizer(reader);
    TokenStream result = new StandardFilter(Version.LUCENE_31, tokenizer);
    result = new LowerCaseFilter(Version.LUCENE_31, result);
    result = new StopFilter(Version.LUCENE_31, result, stopwords);
    return new TokenStreamComponents(tokenizer, result);
  }
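Using an analyzer like this one follows the usual tokenStream() pattern. In the sketch below, WikiAnalyzer and stopSet are hypothetical stand-ins for the truncated class above and its constructor argument:

    Analyzer analyzer = new WikiAnalyzer(stopSet);  // hypothetical names for the class above
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("Some ''Wiki'' [[Category:text]] to index"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());          // lowercased, stop words removed
    }
    ts.end();
    ts.close();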
