Examples of WordTokenizer


Examples of com.dci.intellij.dbn.common.util.WordTokenizer

    public MySqlArgumentsResultSet(ResultSet resultSet) throws SQLException {
        List<Argument> argumentList = new ArrayList<Argument>();
        while (resultSet.next()) {
            String argumentsString = resultSet.getString("ARGUMENTS");
            WordTokenizer wordTokenizer = new WordTokenizer(argumentsString);

            String methodName = resultSet.getString("METHOD_NAME");
            String methodType = resultSet.getString("METHOD_TYPE");
            boolean betweenBrackets = false;
            boolean typePostfixSet = false;
            int argumentPosition = methodType.equals("FUNCTION") ? 0 : 1;

            Argument argument = null;

            for (String token : wordTokenizer.getTokens()) {
                if (argument == null) {
                    typePostfixSet = false;
                    argument = new Argument();
                    argument.methodName = methodName;
                    argument.methodType = methodType;
View Full Code Here

Examples of edu.harvard.wcfia.yoshikoder.document.tokenizer.WordTokenizer

     * Tokenizes the document using a WordTokenizer.  This is not the final version.
     *
     */
    protected void tokenize() throws IOException {
        String txt = getText();
        WordTokenizer tok = new WordTokenizer(getLocale());
        tokenSpans = tok.getTokenSpans(txt);
    }
View Full Code Here

Examples of net.yacy.document.WordTokenizer

                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        Enumeration<String> tokens = null;
                        tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);
View Full Code Here

Examples of net.yacy.document.WordTokenizer

      final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
      final StringBuilder buffer = new StringBuilder(bufferSize);
      buffer.append(document.dc_title());
      buffer.append(document.dc_description());
      buffer.append(document.dc_subject(' '));
      final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);

      int count = 0;

      // loop through potential tag and rank them
      while(tokens.hasMoreElements()) {
        count = 0;
        token = tokens.nextElement();

        /*
        pair.delete(0, pair.indexOf(SPACE)+1);
        if(pair.length() > 1)
          pair.append(SPACE);
View Full Code Here

Examples of net.yacy.document.WordTokenizer

                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        Enumeration<StringBuilder> tokens = null;
                        tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);
View Full Code Here

Examples of net.yacy.document.WordTokenizer

      final StringBuilder buffer = new StringBuilder(bufferSize);
      final StringBuilder pwords = new StringBuilder(1000);
      buffer.append(document.dc_title().toLowerCase());
      buffer.append(document.dc_description().toLowerCase());
      buffer.append(document.dc_subject(' ').toLowerCase());
      final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
      int score = 0;
     
      // get phrases
      final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
      phrases.putAll(getPhrases(document, 3));
      final Iterator<String> iter = phrases.keySet().iterator();
      while(iter.hasNext()) {
        score = 10;
        final String phrase = iter.next();             
        if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
          score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
        }
        if(isDigitSpace(phrase)) {
          score = 10;
        }
        if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {         
          score = score * 10;
        }
        if (tags.containsKey(phrase)) {
          score = score * 20;
        }
        topwords.add(new YMarkTag(phrase, score));
        pwords.append(phrase);
        pwords.append(' ');
      }
     
      // loop through potential tag and rank them
      while(tokens.hasMoreElements()) {       
        score = 0;
        token = tokens.nextElement();
       
        // check if the token appears in the text
        if (words.containsKey(token.toString())) {         
          final Word word = words.get(token.toString());
          // token appears in text and matches an existing bookmark tag
View Full Code Here

Examples of net.yacy.document.WordTokenizer

 
 
  private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
    final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
    final StringBuilder phrase = new StringBuilder(128);
    final Enumeration<StringBuilder> tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
    StringBuilder token;
    int count = 0;
   
    // loop through text
    while(tokens.hasMoreElements()) {       

      token = tokens.nextElement();     
      if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
        continue;     
     
      // if we have a full phrase, delete the first token
      count++;
View Full Code Here

Examples of org.apache.lucene.analysis.cn.smart.WordTokenizer

    wordSegment = new WordSegmenter();
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new SentenceTokenizer(reader);
    result = new WordTokenizer(result, wordSegment);
    // result = new LowerCaseFilter(result);
    // 不再需要LowerCaseFilter,因为SegTokenFilter已经将所有英文字符转换成小写
    // stem太严格了, This is not bug, this feature:)
    result = new PorterStemFilter(result);
    if (stopWords != null) {
View Full Code Here

Examples of org.languagetool.tokenizers.WordTokenizer

  private DemoDisambiguator disamb2;
 
  /**
   * Assigns newly constructed collaborators to tagger, tokenizer,
   * sentenceTokenizer, disambiguator and disamb2.
   *
   * NOTE(review): presumably a test-fixture hook that the framework runs
   * before each test; the superclass is not visible here, so confirm.
   */
  @Override
  public void setUp() {
    // English-specific tagger.
    tagger = new EnglishTagger();
    // Word- and sentence-level tokenizers, both built with their no-arg constructors.
    tokenizer = new WordTokenizer();
    sentenceTokenizer = new SentenceTokenizer();
    // Two disambiguators: the English rule-based one and the demo one.
    disambiguator = new EnglishRuleDisambiguator();
    disamb2 = new DemoDisambiguator();
  }
View Full Code Here

Examples of org.languagetool.tokenizers.WordTokenizer

  private WordTokenizer tokenizer;
     
  /**
   * Assigns a new DutchTagger to tagger and a new WordTokenizer to tokenizer.
   *
   * NOTE(review): presumably a test-fixture hook that the framework runs
   * before each test; the superclass is not visible here, so confirm.
   */
  @Override
  public void setUp() {
    // Dutch-specific tagger.
    tagger = new DutchTagger();
    // Word tokenizer built with its no-arg constructor.
    tokenizer = new WordTokenizer();
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.