Package org.languagetool.tokenizers.pl

Source Code of org.languagetool.tokenizers.pl.PolishWordTokenizerTest

/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
* USA
*/

package org.languagetool.tokenizers.pl;

import java.util.List;

import junit.framework.TestCase;

import org.languagetool.Language;
import org.languagetool.language.Polish;

public class PolishWordTokenizerTest extends TestCase {

  public void testTokenize() {
    final PolishWordTokenizer wordTokenizer = new PolishWordTokenizer();
    final List<String> tokens = wordTokenizer.tokenize("To jest\u00A0 test");
    assertEquals(tokens.size(), 6);
    assertEquals("[To,  , jest, \u00A0,  , test]", tokens.toString());
    final List<String> tokens2 = wordTokenizer.tokenize("To\rłamie");
    assertEquals(3, tokens2.size());
    assertEquals("[To, \r, łamie]", tokens2.toString());
    //hyphen with no whitespace
    final List<String> tokens3 = wordTokenizer.tokenize("A to jest-naprawdę-test!");
    assertEquals(tokens3.size(), 6);
    assertEquals("[A,  , to,  , jest-naprawdę-test, !]", tokens3.toString());
    //hyphen at the end of the word
    final List<String> tokens4 = wordTokenizer.tokenize("Niemiecko- i angielsko-polski");
    assertEquals(tokens4.size(), 6);
    assertEquals("[Niemiecko, -,  , i,  , angielsko-polski]", tokens4.toString());

    //hyphen probably instead of mdash
    final List<String> tokens5 = wordTokenizer.tokenize("Widzę krowę -i to dobrze!");
    assertEquals(11, tokens5.size());
    assertEquals("[Widzę,  , krowę,  , -, i,  , to,  , dobrze, !]", tokens5.toString());

    //mdash
    final List<String> tokens6 = wordTokenizer.tokenize("A to jest zdanie—rzeczywiście—z wtrąceniem.");
    assertEquals(tokens6.size(), 14);
    assertEquals("[A,  , to,  , jest,  , zdanie, —, rzeczywiście, —, z,  , wtrąceniem, .]", tokens6.toString());

    //compound words with hyphens
    final String compoundSentence = "To jest kobieta-wojownik w polsko-czeskim ubraniu, która wysłała dwa SMS-y.";
    List<String> compoundTokens = wordTokenizer.tokenize(compoundSentence);
    assertEquals(21, compoundTokens.size());
    assertEquals("[To,  , jest,  , kobieta-wojownik,  , w,  , polsko-czeskim,  , ubraniu, ,,  , która,  , wysłała,  , dwa,  , SMS-y, .]", compoundTokens.toString());
    //now setup the tagger...
    Language pl = new Polish();
    wordTokenizer.setTagger(pl.getTagger());
    compoundTokens = wordTokenizer.tokenize(compoundSentence);
    //we should get 4 more tokens: two hyphen tokens and two for the split words
    assertEquals(25, compoundTokens.size());
    assertEquals("[To,  , jest,  , kobieta, -, wojownik,  , " +
        "w,  , polsko, -, czeskim,  , ubraniu, ,,  " +
        ", która,  , wysłała,  , dwa,  , SMS-y, .]", compoundTokens.toString());
    compoundTokens = wordTokenizer.tokenize("Miała osiemnaście-dwadzieścia lat.");
    assertEquals(8, compoundTokens.size());
    assertEquals("[Miała,  , osiemnaście, -, dwadzieścia,  , lat, .]", compoundTokens.toString());
    // now three-part adja-adja-adj...:
    compoundTokens = wordTokenizer.tokenize("Słownik polsko-niemiecko-indonezyjski");
    assertEquals(7, compoundTokens.size());
    assertEquals("[Słownik,  , polsko, -, niemiecko, -, indonezyjski]", compoundTokens.toString());
  }

}
TOP

Related Classes of org.languagetool.tokenizers.pl.PolishWordTokenizerTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.