Package org.terrier.indexing

Examples of org.terrier.indexing.TRECFullTokenizer


      if (! Files.exists(queryfilename) || ! Files.canRead(queryfilename)) {
        logger.error("The topics file " + queryfilename + " does not exist, or it cannot be read.");
        return false;
      } else {
        br = Files.openFileReader(queryfilename,desiredEncoding);
        TRECFullTokenizer queryTokenizer = new TRECFullTokenizer(
              new TagSet(TagSet.TREC_QUERY_TAGS),
              new TagSet(TagSet.EMPTY_TAGS),
              br);
        queryTokenizer.setIgnoreMissingClosingTags(true);
        while (!queryTokenizer.isEndOfFile()) {
          String docnoToken = null;
          StringBuilder query = new StringBuilder();
          boolean seenDescriptionToken = ! IGNORE_DESC_NARR_NAME_TOKENS;
          boolean seenNarrativeToken = ! IGNORE_DESC_NARR_NAME_TOKENS;
          while (!queryTokenizer.isEndOfDocument()) {
            String token = queryTokenizer.nextToken();
            if (token == null
                || token.length() == 0
                || queryTokenizer.inTagToSkip())
              continue;
           
            if (queryTokenizer.inDocnoTag()) {
              // The tokenizer is built from the trimmed contents of the query
              // number tag, so that the last token extracted from it is always
              // the query number and never an empty string.
              StringTokenizer docnoTokens =
                new StringTokenizer(token.trim(), " ");
              while (docnoTokens.hasMoreTokens())
                docnoToken = docnoTokens.nextToken().trim();
            } else if (queryTokenizer.inTagToProcess()) {
              // Removed the code that checks if "description" and
              // "narrative" appear in "desc" and "narr", respectively.
              // THIS WILL HURT THE RETRIEVAL PERFORMANCE. Therefore,
              // it is recommended to add these words in the stopword
              // list.
              if (!seenDescriptionToken && queryTokenizer
                .currentTag()
                .toUpperCase()
                 .equals("DESC")
                 && token.toUpperCase().equals("DESCRIPTION"))
                 continue;
                if (!seenNarrativeToken && queryTokenizer
                 .currentTag()
                 .toUpperCase()
                 .equals("NARR")
                 && token.toUpperCase().equals("NARRATIVE"))
                 continue
              query.append(token);
              query.append(' ');
             
            }
          }
          queryTokenizer.nextDocument();
          if (query.length() == 0)
            continue;
          vecStringQueries.add(query.toString().trim());
          vecStringIds.add(docnoToken.trim());
         
View Full Code Here

TOP

Related Classes of org.terrier.indexing.TRECFullTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.