Examples of EnglishStemmer


Examples of edu.ucla.sspace.text.EnglishStemmer

        public SemEvalHandler() {
            contexts = new ArrayList<Document>();
            inContext = false;
            context = new StringBuilder();
            stemmer = new EnglishStemmer();
        }
View Full Code Here

Examples of edu.ucla.sspace.text.EnglishStemmer

        if (args.length != 2) {
            System.out.println("Usage: StemTermList <in-term-list> <out-term>");
            System.exit(1);
        }

        Stemmer stemmer = new EnglishStemmer();

        PrintWriter writer = new PrintWriter(args[1]);

        BufferedReader br = new BufferedReader(new FileReader(args[0]));
        for (String line = null; (line=br.readLine()) != null; )
            writer.println(stemmer.stem(line.trim()));
        writer.close();
    }
View Full Code Here

Examples of org.tartarus.snowball.EnglishStemmer

   * @return the keyword list
   */
  public KeywordList extractKeyword(String string, boolean onlyNoun)
  {
    List<Keyword> ret = new ArrayList<Keyword>();
    EnglishStemmer engStemmer = new EnglishStemmer();
   
    try {
      List<MExpression> meList = leaveJustBest(postProcess(analyze(string)));

      Morpheme mp = null;
      MCandidate mc = null;
      MExpression me = null;
      Keyword keyword = null;
      List<Morpheme> mpList = new ArrayList<Morpheme>();
      for( int i = 0, size = meList == null ? 0 : meList.size(); i < size; i++ ) {
        me = meList.get(i);
        mc = me.get(0);

        int jSize = mc.size();
        if( jSize == 1 ) {
          mp = mc.get(0);
          mp.setString(me.getExp());
          mpList.add(mp);
        } else {
          // 분할되지 않은 리스트 형태로 형태소를 넣어준다.
          for( int j = 0; j < jSize; j++ )
            mpList.add(mc.get(j));
        }

      }

      // 복합 UOM 확인
      for( int endIdx = mpList.size() - 1; endIdx > 0; endIdx-- ) {
        for( int startIdx = Math.max(endIdx - MAX_UOM_SIZE, 0); startIdx < endIdx; startIdx++ ) {
          String tempName = "";
          for( int i = startIdx; i <= endIdx; i++ ) {
            tempName += mpList.get(i).getString();
          }

          // 다수의 토큰으로 이루어진 UOM 확인
          if( UOMDic.contains(tempName) ) {
            for( ; startIdx < endIdx; endIdx-- ) {
              mpList.remove(startIdx + 1);
            }
            mp = mpList.get(startIdx);
            mp.setString(tempName);
            mp.setCharSet(CharSetType.COMBINED);
            mp.setTag(POSTag.NNM);
          }
          // 다수의 토큰으로 이루어진 화학식 확인
          else if( ChemFormulaDic.contains(tempName) ) {
            for( ; startIdx < endIdx; endIdx-- ) {
              mpList.remove(startIdx + 1);
            }
            mp = mpList.get(startIdx);
            mp.setString(tempName);
            mp.setCharSet(CharSetType.COMBINED);
            mp.setTag(POSTag.UN);
          }
          // 다수의 토큰으로 이루어진 명사 확인 ((주), Web2.0)류의 키워드
          else if( CompNounDic.contains(tempName) ) {
            for( ; startIdx < endIdx; endIdx-- ) {
              mpList.remove(startIdx + 1);
            }
            if( !JunkWordDic.contains(tempName) ) {
              mp = mpList.get(startIdx);
              mp.setString(tempName);
              mp.setCharSet(CharSetType.COMBINED);
              mp.setTag(POSTag.NNG);
              mp.setComposed(true);
            }
          }
        }
      }

      // 키워드 추출
      for( int i = 0, size = mpList.size(); i < size; i++ ) {
        mp = mpList.get(i);
        mp.setString(mp.getString().toLowerCase());

        // stemming 및 키워드 추출
        if( (!onlyNoun || mp.isTagOf(POSTag.N) ) 
            && !JunkWordDic.contains(mp.getString()) )
        {

          // do stemming english word
          if( mp.isTagOf(POSTag.UN)
              && mp.getCharSet() == CharSetType.ENGLISH )
          {
            keyword = new Keyword(mp);
            engStemmer.setCurrent(keyword.getString().toLowerCase());
            engStemmer.stem();
            keyword.setString(engStemmer.getCurrent());
            ret.add(keyword);
          }
          // 사랑하 로 추출된 경우 명사 '사랑'을 색인어로 추출
          else if( mp.isTagOf(POSTag.V) ) {
            String temp = mp.getString();
View Full Code Here

Examples of org.tartarus.snowball.ext.EnglishStemmer

public class SnowballStemmerTest extends TamingTextTestJ4 {

  @Test
  public void test() throws NoSuchMethodException {
    //<start id="stemmer"/>
    EnglishStemmer english = new EnglishStemmer();

    String[] test = {"bank", "banks", "banking", "banker", "banked", "bankers"};//<co id="stemmer.co.test"/>
    String[] gold = {"bank", "bank", "bank", "banker", "bank", "banker"};//<co id="stemmer.co.gold"/>
    for (int i = 0; i < test.length; i++) {
      english.setCurrent(test[i]);//<co id="stemmer.co.set"/>
      english.stem();//<co id="stemmer.co.stem"/>
      System.out.println("English: " + english.getCurrent());
      assertTrue(english.getCurrent() + " is not equal to " + gold[i], english.getCurrent().equals(gold[i]) == true);
    }
    /*
<calloutlist>
<callout arearefs="stemmer.co.test"><para>Setup some tokens to be stemmed</para></callout>
<callout arearefs="stemmer.co.gold"><para>Define our expectations for results</para></callout>
View Full Code Here

Examples of org.tartarus.snowball.ext.EnglishStemmer

import java.util.Collections;

public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {

  public void test() throws IOException {
    EnglishStemmer stemmer = new EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    String[] gold = new String[test.length];
    for (int i = 0; i < test.length; i++) {
      stemmer.setCurrent(test[i]);
      stemmer.stem();
      gold[i] = stemmer.getCurrent();
    }

    EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
View Full Code Here

Examples of org.tartarus.snowball.ext.EnglishStemmer

    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, gold);
  }

  public void testProtected() throws Exception {
    EnglishStemmer stemmer = new EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    String[] gold = new String[test.length];
    for (int i = 0; i < test.length; i++) {
      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
        stemmer.setCurrent(test[i]);
        stemmer.stem();
        gold[i] = stemmer.getCurrent();
      } else {
        gold[i] = test[i];
      }
    }
View Full Code Here

Examples of org.tartarus.snowball.ext.EnglishStemmer

import java.util.Collections;

public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {

  public void test() throws IOException {
    EnglishStemmer stemmer = new EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    String[] gold = new String[test.length];
    for (int i = 0; i < test.length; i++) {
      stemmer.setCurrent(test[i]);
      stemmer.stem();
      gold[i] = stemmer.getCurrent();
    }

    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
    args.put("language", "English");
View Full Code Here

Examples of org.tartarus.snowball.ext.EnglishStemmer

  /**
   * Tests the protected words mechanism of EnglishPorterFilterFactory
   */
  @Deprecated
  public void testProtectedOld() throws Exception {
    EnglishStemmer stemmer = new EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    String[] gold = new String[test.length];
    for (int i = 0; i < test.length; i++) {
      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
        stemmer.setCurrent(test[i]);
        stemmer.stem();
        gold[i] = stemmer.getCurrent();
      } else {
        gold[i] = test[i];
      }
    }

View Full Code Here

Examples of org.tartarus.snowball.ext.EnglishStemmer

import java.util.Collections;

public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {

  public void test() throws IOException {
    EnglishStemmer stemmer = new EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    StringBuilder gold = new StringBuilder();
    for (String aTest : test) {
      stemmer.setCurrent(aTest);
      stemmer.stem();
      gold.append(stemmer.getCurrent()).append(' ');
    }

    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("language", "English");
View Full Code Here

Examples of org.tartarus.snowball.ext.EnglishStemmer

    String out = tsToString(factory.create(new IterTokenStream(test)));
    assertEquals(gold.toString().trim(), out);
  }

  public void testProtected() throws Exception {
    EnglishStemmer stemmer = new EnglishStemmer();
    String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
    StringBuilder gold = new StringBuilder();
    for (int i = 0; i < test.length; i++) {
      if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
        stemmer.setCurrent(test[i]);
        stemmer.stem();
        gold.append(stemmer.getCurrent()).append(' ');
      } else {
        gold.append(test[i]).append(' ');
      }
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.