Package org.apache.lucene.index

Examples of org.apache.lucene.index.TermEnum


  public static TermStats[] getHighFreqTerms(IndexReader reader,
      int numTerms, String field) throws Exception {

    TermInfoWiTFQueue tiq = new TermInfoWiTFQueue(numTerms);
    if (field != null) {
      TermEnum terms = reader.terms(new Term(field));
      if (terms != null && terms.term() != null) {
        do {
          if (!terms.term().field().equals(field)) {
            break;
          }
          tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
        } while (terms.next());
      } else {
        System.out.println("No terms for field \"" + field + "\"");
      }
    } else {
      TermEnum terms = reader.terms();
      while (terms.next()) {
        tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
      }
    }

    TermStats[] result = new TermStats[tiq.size()];
View Full Code Here


    // Separately count how many tokens are actually in the index:
    IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory());
    assertEquals(NUM_DOCS, reader.numDocs());

    TermEnum terms = reader.terms();
    TermDocs termDocs = reader.termDocs();
    int totalTokenCount2 = 0;
    while(terms.next()) {
      termDocs.seek(terms.term());
      while(termDocs.next())
        totalTokenCount2 += termDocs.freq();
    }
    reader.close();
View Full Code Here

      if (fields == null || fields.contains(fieldName)) {
        getTermsByFieldAndText().put(fieldName, new HashMap<String, InstantiatedTerm>(5000));
      }
    }
    List<InstantiatedTerm> terms = new ArrayList<InstantiatedTerm>(5000 * getTermsByFieldAndText().size());
    TermEnum termEnum = sourceIndexReader.terms();
    while (termEnum.next()) {
      if (fields == null || fields.contains(termEnum.term().field())) { // todo skipto if not using field
        InstantiatedTerm instantiatedTerm = new InstantiatedTerm(termEnum.term().field(), termEnum.term().text());
        getTermsByFieldAndText().get(termEnum.term().field()).put(termEnum.term().text(), instantiatedTerm);
        instantiatedTerm.setTermIndex(terms.size());
        terms.add(instantiatedTerm);
        instantiatedTerm.setAssociatedDocuments(new InstantiatedTermDocumentInformation[termEnum.docFreq()]);
      }
    }
    termEnum.close();
    orderedTerms = terms.toArray(new InstantiatedTerm[terms.size()]);

    // create term-document informations
    for (InstantiatedTerm term : orderedTerms) {
      TermPositions termPositions = sourceIndexReader.termPositions(term.getTerm());
View Full Code Here

            processedTerms.add(term);
                ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                float minScore=0;
                Term startTerm=internSavingTemplateTerm.createTerm(term);
                FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
                TermEnum origEnum = reader.terms(startTerm);
                int df=0;
                if(startTerm.equals(origEnum.term()))
                {
                    df=origEnum.docFreq(); //store the df so all variants use same idf
                }
                int numVariants=0;
                int totalVariantDocFreqs=0;
                do
                {
View Full Code Here

        System.out.println("FST stores ord");
      } else {
        System.out.println("FST stores docFreq");
      }
    }
    TermEnum termEnum = r.terms(new Term("body", ""));
    if (VERBOSE) {
      System.out.println("TEST: got termEnum=" + termEnum);
    }
    int ord = 0;
    while(true) {
      final Term term = termEnum.term();
      if (term == null || !"body".equals(term.field())) {
        break;
      }

      // No ord in 3.x:
      /*
      if (ord == 0) {
        try {
          termsEnum.ord();
        } catch (UnsupportedOperationException uoe) {
          if (VERBOSE) {
            System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
          }
          storeOrd = false;
        }
      }
      */
      final int output;
      if (storeOrd) {
        output = ord;
      } else {
        output = termEnum.docFreq();
      }
      //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0)));
      builder.add(toIntsRef(term.text()), outputs.get(output));
      ord++;
      if (ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
        System.out.println(ord + " terms...");
      }
      termEnum.next();
    }
    final FST<Long> fst = builder.finish();
    if (VERBOSE) {
      System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
    }

    if (ord > 0) {
      // Now confirm BytesRefFSTEnum and TermEnum act the
      // same:
      final IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
      int num = atLeast(1000);
      for(int iter=0;iter<num;iter++) {
        final String randomTerm = getRandomString();

        if (VERBOSE) {
          System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0)));
        }

        termEnum = r.terms(new Term("body", randomTerm));
        final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm));

        if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
          assertNull("got " + (fstSeekResult == null ? "null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult);
        } else {
          assertSame(termEnum, fstEnum, storeOrd);
          for(int nextIter=0;nextIter<10;nextIter++) {
            if (VERBOSE) {
              System.out.println("TEST: next");
              //if (storeOrd) {
              //System.out.println("  ord=" + termEnum.ord());
              //}
            }
            termEnum.next();
            if (termEnum.term() != null && "body".equals(termEnum.term().field())) {
              if (VERBOSE) {
                System.out.println("  term=" + termEnum.term());
              }
              assertNotNull(fstEnum.next());
              assertSame(termEnum, fstEnum, storeOrd);
            } else {
              if (VERBOSE) {
View Full Code Here

   * Attempts to detect the given field type for an IndexReader.
   * @deprecated
   */
  static int detectFieldType(IndexReader reader, String fieldKey) throws IOException {
    String field = StringHelper.intern(fieldKey);
    TermEnum enumerator = reader.terms(new Term(field));
    try {
      Term term = enumerator.term();
      if (term == null) {
        throw new RuntimeException("no terms in field " + field + " - cannot determine sort type");
      }
      int ret = 0;
      if (term.field() == field) {
        String termtext = term.text().trim();

        try {
          Integer.parseInt (termtext);
          ret = SortField.INT;
        } catch (NumberFormatException nfe1) {
          try {
            Long.parseLong(termtext);
            ret = SortField.LONG;
          } catch (NumberFormatException nfe2) {
            try {
              Float.parseFloat (termtext);
              ret = SortField.FLOAT;
            } catch (NumberFormatException nfe3) {
              ret = SortField.STRING;
            }
          }
        }        
      } else {
        throw new RuntimeException("field \"" + field + "\" does not appear to be indexed");
      }
      return ret;
    } finally {
      enumerator.close();
    }
  }
View Full Code Here

        }
      }
      final int ix = i;
      final int jx = j;
 
      return new TermEnum() {
 
        private int i = ix; // index into info.sortedTerms
        private int j = jx; // index into sortedFields
         
        public boolean next() {
View Full Code Here

    protected Object createValue(IndexReader reader, Entry entryKey)
        throws IOException {
      String field = StringHelper.intern((String) entryKey.field);
      final String[] retArray = new String[reader.maxDoc()];
      TermDocs termDocs = reader.termDocs();
      TermEnum termEnum = reader.terms (new Term (field));
      try {
        do {
          Term term = termEnum.term();
          if (term==null || term.field() != field) break;
          String termval = term.text();
          termDocs.seek (termEnum);
          while (termDocs.next()) {
            retArray[termDocs.doc()] = termval;
          }
        } while (termEnum.next());
      } finally {
        termDocs.close();
        termEnum.close();
      }
      return retArray;
    }
View Full Code Here

        throws IOException {
      String field = StringHelper.intern((String) entryKey.field);
      final int[] retArray = new int[reader.maxDoc()];
      String[] mterms = new String[reader.maxDoc()+1];
      TermDocs termDocs = reader.termDocs();
      TermEnum termEnum = reader.terms (new Term (field));
      int t = 0// current term number

      // an entry for documents that have no terms in this field
      // should a document with no terms be at top or bottom?
      // this puts them at the top - if it is changed, FieldDocSortedHitQueue
      // needs to change as well.
      mterms[t++] = null;

      try {
        do {
          Term term = termEnum.term();
          if (term==null || term.field() != field) break;

          // store term text
          // we expect that there is at most one term per document
          if (t >= mterms.length) throw new RuntimeException ("there are more terms than " +
                  "documents in field \"" + field + "\", but it's impossible to sort on " +
                  "tokenized fields");
          mterms[t] = term.text();

          termDocs.seek (termEnum);
          while (termDocs.next()) {
            retArray[termDocs.doc()] = t;
          }

          t++;
        } while (termEnum.next());
      } finally {
        termDocs.close();
        termEnum.close();
      }

      if (t == 0) {
        // if there are no terms, make the term array
        // have a single null entry
View Full Code Here

    }

    protected Object createValue(IndexReader reader, Entry entryKey)
        throws IOException {
      String field = StringHelper.intern((String) entryKey.field);
      TermEnum enumerator = reader.terms (new Term (field));
      try {
        Term term = enumerator.term();
        if (term == null) {
          throw new RuntimeException ("no terms in field " + field + " - cannot determine type");
        }
        Object ret = null;
        if (term.field() == field) {
          String termtext = term.text().trim();

          try {
            Integer.parseInt (termtext);
            ret = wrapper.getInts (reader, field);
          } catch (NumberFormatException nfe1) {
            try {
              Long.parseLong(termtext);
              ret = wrapper.getLongs (reader, field);
            } catch (NumberFormatException nfe2) {
              try {
                Float.parseFloat (termtext);
                ret = wrapper.getFloats (reader, field);
              } catch (NumberFormatException nfe3) {
                ret = wrapper.getStringIndex (reader, field);
              }
            }
          }         
        } else {
          throw new RuntimeException ("field \"" + field + "\" does not appear to be indexed");
        }
        return ret;
      } finally {
        enumerator.close();
      }
    }
View Full Code Here

TOP

Related Classes of org.apache.lucene.index.TermEnum

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.