Package org.apache.solr.schema

Examples of org.apache.solr.schema.FieldType$DefaultAnalyzer


    try {

      // Get the field's analyzer
      if (fieldTypeName != null
              && schema.getFieldTypeNoEx(fieldTypeName) != null) {
        FieldType fieldType = schema.getFieldTypes()
                .get(fieldTypeName);
        // Do index-time analysis using the given fieldType's analyzer
        RAMDirectory ramDir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(ramDir, fieldType.getAnalyzer(),
                true, IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setMergeFactor(300);
        writer.setMaxBufferedDocs(150);

        List<String> lines = loader.getLines(sourceLocation, characterEncoding);
View Full Code Here


            SolrInputDocument doc = readDoc(parser);
            SchemaField uniq = schema.getUniqueKeyField();
            NamedList<NamedList<NamedList<Object>>> theTokens = new SimpleOrderedMap<NamedList<NamedList<Object>>>();
            result.add(doc.getFieldValue(uniq.getName()).toString(), theTokens);
            for (String name : doc.getFieldNames()) {
              FieldType ft = schema.getFieldType(name);
              Analyzer analyzer = ft.getAnalyzer();
              Collection<Object> vals = doc.getFieldValues(name);
              for (Object val : vals) {
                Reader reader = new StringReader(val.toString());
                TokenStream tstream = analyzer.tokenStream(name, reader);
                NamedList<NamedList<Object>> tokens = getTokens(tstream);
View Full Code Here

    }

    BufferedReader r = new BufferedReader(new InputStreamReader(is));

    String idName = ffs.keyField.getName().intern();
    FieldType idType = ffs.keyField.getType();
    boolean sorted=true;   // assume sorted until we discover it's not


    // warning: lucene's termEnum.skipTo() is not optimized... it simply does a next()
    // because of this, simply ask the reader for a new termEnum rather than
    // trying to use skipTo()

    List<String> notFound = new ArrayList<String>();
    int notFoundCount=0;
    int otherErrors=0;

    TermDocs termDocs = null;
    Term protoTerm = new Term(idName, "");
    TermEnum termEnum = null;
    // Number of times to try termEnum.next() before resorting to skip
    int numTimesNext = 10;

    char delimiter='=';
    String termVal;
    boolean hasNext=true;
    String prevKey="";

    String lastVal="\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF";

    try {
      termDocs = reader.termDocs();
      termEnum = reader.terms(protoTerm);
      Term t = termEnum.term();
      if (t != null && t.field() == idName) { // intern'd comparison
        termVal = t.text();
      } else {
        termVal = lastVal;
      }


      for (String line; (line=r.readLine())!=null;) {
        int delimIndex = line.indexOf(delimiter);
        if (delimIndex < 0) continue;

        int endIndex = line.length();
        /* EOLs should already be removed for BufferedReader.readLine()
        for(int endIndex = line.length();endIndex>delimIndex+1; endIndex--) {
          char ch = line.charAt(endIndex-1);
          if (ch!='\n' && ch!='\r') break;
        }
        */
        String key = line.substring(0, delimIndex);
        String val = line.substring(delimIndex+1, endIndex);

        String internalKey = idType.toInternal(key);
        float fval;
        try {
          fval=Float.parseFloat(val);
        } catch (Exception e) {
          if (++otherErrors<=10) {
View Full Code Here


  public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, Integer mincount, boolean missing, String sort, String prefix) throws IOException {
    use.incrementAndGet();

    FieldType ft = searcher.getSchema().getFieldType(field);

    NamedList res = new NamedList()// order is important

    DocSet docs = baseDocs;
    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();

    if (baseSize >= mincount) {

      final int[] index = this.index;
      final int[] counts = new int[numTermsInField];

      //
      // If there is prefix, find it's start and end term numbers
      //
      int startTerm = 0;
      int endTerm = numTermsInField;  // one past the end

      NumberedTermEnum te = ti.getEnumerator(searcher.getReader());
      if (prefix != null && prefix.length() > 0) {
        te.skipTo(prefix);
        startTerm = te.getTermNumber();
        te.skipTo(prefix + "\uffff\uffff\uffff\uffff");
        endTerm = te.getTermNumber();
      }

      /***********
      // Alternative 2: get the docSet of the prefix (could take a while) and
      // then do the intersection with the baseDocSet first.
      if (prefix != null && prefix.length() > 0) {
        docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs);
        // The issue with this method are problems of returning 0 counts for terms w/o
        // the prefix.  We can't just filter out those terms later because it may
        // mean that we didn't collect enough terms in the queue (in the sorted case).
      }
      ***********/

      boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0
              && startTerm==0 && endTerm==numTermsInField
              && docs instanceof BitDocSet;

      if (doNegative) {
        OpenBitSet bs = (OpenBitSet)((BitDocSet)docs).getBits().clone();
        bs.flip(0, maxDoc);
        // TODO: when iterator across negative elements is available, use that
        // instead of creating a new bitset and inverting.
        docs = new BitDocSet(bs, maxDoc - baseSize);
        // simply negating will mean that we have deleted docs in the set.
        // that should be OK, as their entries in our table should be empty.
      }

      // For the biggest terms, do straight set intersections
      for (TopTerm tt : bigTerms.values()) {
        // TODO: counts could be deferred if sorted==false
        if (tt.termNum >= startTerm && tt.termNum < endTerm) {
          counts[tt.termNum] = searcher.numDocs(new TermQuery(tt.term), docs);
        }
      }

      // TODO: we could short-circuit counting altogether for sorted faceting
      // where we already have enough terms from the bigTerms

      // TODO: we could shrink the size of the collection array, and
      // additionally break when the termNumber got above endTerm, but
      // it would require two extra conditionals in the inner loop (although
      // they would be predictable for the non-prefix case).
      // Perhaps a different copy of the code would be warranted.

      if (termInstances > 0) {
        DocIterator iter = docs.iterator();
        while (iter.hasNext()) {
          int doc = iter.nextDoc();
          int code = index[doc];

          if ((code & 0xff)==1) {
            int pos = code>>>8;
            int whichArray = (doc >>> 16) & 0xff;
            byte[] arr = tnums[whichArray];
            int tnum = 0;
            for(;;) {
              int delta = 0;
              for(;;) {
                byte b = arr[pos++];
                delta = (delta << 7) | (b & 0x7f);
                if ((b & 0x80) == 0) break;
              }
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              counts[tnum]++;
            }
          } else {
            int tnum = 0;
            int delta = 0;
            for (;;) {
              delta = (delta << 7) | (code & 0x7f);
              if ((code & 0x80)==0) {
                if (delta==0) break;
                tnum += delta - TNUM_OFFSET;
                counts[tnum]++;
                delta = 0;
              }
              code >>>= 8;
            }
          }
        }
      }

      int off=offset;
      int lim=limit>=0 ? limit : Integer.MAX_VALUE;

      if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
        int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
        maxsize = Math.min(maxsize, numTermsInField);
        final BoundedTreeSet<Long> queue = new BoundedTreeSet<Long>(maxsize);
        int min=mincount-1// the smallest value in the top 'N' values
        for (int i=startTerm; i<endTerm; i++) {
          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
          if (c>min) {
            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
            // index order, so we already know that the keys are ordered.  This can be very
            // important if a lot of the counts are repeated (like zero counts would be).

            // minimize object creation and speed comparison by creating a long that
            // encompases both count and term number.
            // Since smaller values are kept in the TreeSet, make higher counts smaller.
            //
            //   for equal counts, lower term numbers
            // should come first and hence be "greater"

            //long pair = (((long)c)<<32) | (0x7fffffff-i) ;   // use if priority queue
            long pair = (((long)-c)<<32) | i;
            queue.add(new Long(pair));
            if (queue.size()>=maxsize) min=-(int)(queue.last().longValue() >>> 32);
          }
        }
        // now select the right page from the results
        for (Long p : queue) {
          if (--off>=0) continue;
          if (--lim<0) break;
          int c = -(int)(p.longValue() >>> 32);
          //int tnum = 0x7fffffff - (int)p.longValue();  // use if priority queue
          int tnum = (int)p.longValue();
          String label = ft.indexedToReadable(getTermText(te, tnum));
          res.add(label, c);
        }
      } else {
        // add results in index order
        int i=startTerm;
        if (mincount<=0) {
          // if mincount<=0, then we won't discard any terms and we know exactly
          // where to start.
          i=startTerm+off;
          off=0;
        }

        for (; i<endTerm; i++) {
          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
          if (c<mincount || --off>=0) continue;
          if (--lim<0) break;

          String label = ft.indexedToReadable(getTermText(te, i));
          res.add(label, c);
        }
      }

      te.close();
View Full Code Here

    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();

    if (baseSize <= 0) return allstats;

    FieldType ft = searcher.getSchema().getFieldType(field);

    DocSet missing = docs.andNot( searcher.getDocSet(new TermRangeQuery(field, null, null, false, false)) );

    int i = 0;
    final FieldFacetStats[] finfo = new FieldFacetStats[facet.length];
    //Initialize facetstats, if facets have been passed in
    FieldCache.StringIndex si;
    for (String f : facet) {
      FieldType facet_ft = searcher.getSchema().getFieldType(f);
      try {
        si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), f);
      }
      catch (IOException e) {
        throw new RuntimeException("failed to open field cache for: " + f, e);
      }
      finfo[i] = new FieldFacetStats(f, si, facet_ft, numTermsInField);
      i++;
    }

    final int[] index = this.index;
    final int[] counts = new int[numTermsInField];//keep track of the number of times we see each word in the field for all the documents in the docset

    NumberedTermEnum te = ti.getEnumerator(searcher.getReader());


    boolean doNegative = false;
    if (finfo.length == 0) {
      //if we're collecting statistics with a facet field, can't do inverted counting
      doNegative = baseSize > maxDoc >> 1 && termInstances > 0
              && docs instanceof BitDocSet;
    }

    if (doNegative) {
      OpenBitSet bs = (OpenBitSet) ((BitDocSet) docs).getBits().clone();
      bs.flip(0, maxDoc);
      // TODO: when iterator across negative elements is available, use that
      // instead of creating a new bitset and inverting.
      docs = new BitDocSet(bs, maxDoc - baseSize);
      // simply negating will mean that we have deleted docs in the set.
      // that should be OK, as their entries in our table should be empty.
    }

    // For the biggest terms, do straight set intersections
    for (TopTerm tt : bigTerms.values()) {
      // TODO: counts could be deferred if sorted==false
      if (tt.termNum >= 0 && tt.termNum < numTermsInField) {
        if (finfo.length == 0) {
          counts[tt.termNum] = searcher.numDocs(new TermQuery(tt.term), docs);
        } else {
          //COULD BE VERY SLOW
          //if we're collecting stats for facet fields, we need to iterate on all matching documents
          DocSet bigTermDocSet = searcher.getDocSet(new TermQuery(tt.term)).intersection(docs);
          DocIterator iter = bigTermDocSet.iterator();
          while (iter.hasNext()) {
            int doc = iter.nextDoc();
            counts[tt.termNum]++;
            for (FieldFacetStats f : finfo) {
              f.facetTermNum(doc, tt.termNum);
            }
          }
        }
      }
    }


    if (termInstances > 0) {
      DocIterator iter = docs.iterator();
      while (iter.hasNext()) {
        int doc = iter.nextDoc();
        int code = index[doc];

        if ((code & 0xff) == 1) {
          int pos = code >>> 8;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (; ;) {
            int delta = 0;
            for (; ;) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            counts[tnum]++;
            for (FieldFacetStats f : finfo) {
              f.facetTermNum(doc, tnum);
            }
          }
        } else {
          int tnum = 0;
          int delta = 0;
          for (; ;) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              counts[tnum]++;
              for (FieldFacetStats f : finfo) {
                f.facetTermNum(doc, tnum);
              }
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }

    // add results in index order

    for (i = 0; i < numTermsInField; i++) {
      int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
      if (c == 0) continue;
      Double value = Double.parseDouble(ft.indexedToReadable(getTermText(te, i)));
      allstats.accumulate(value, c);
      //as we've parsed the termnum into a value, lets also accumulate fieldfacet statistics
      for (FieldFacetStats f : finfo) {
        f.accumulateTermNum(i, value);
      }
    }
    te.close();

    int c = missing.size();
    allstats.addMissing(c);

    if (finfo.length > 0) {
      allstats.facets = new HashMap<String, Map<String, StatsValues>>();
      for (FieldFacetStats f : finfo) {
        Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
        FieldType facetType = searcher.getSchema().getFieldType(f.name);
        for (Map.Entry<String,StatsValues> entry : facetStatsValues.entrySet()) {
          String termLabel = entry.getKey();
          int missingCount = searcher.numDocs(new TermQuery(new Term(f.name, facetType.toInternal(termLabel))), missing);
          entry.getValue().addMissing(missingCount);
        }
        allstats.facets.put(f.name, facetStatsValues);
      }
    }
View Full Code Here

        String[] facets = params.getFieldParams(f, StatsParams.STATS_FACET);
        if (facets == null) {
          facets = new String[0]; // make sure it is something...
        }
        SchemaField sf = searcher.getSchema().getField(f);
        FieldType ft = sf.getType();
        NamedList stv;

        // Currently, only UnInvertedField can deal with multi-part trie fields
        String prefix = TrieField.getMainValuePrefix(ft);

        if (sf.multiValued() || ft.multiValuedFieldCache() || prefix!=null) {
          //use UnInvertedField for multivalued fields
          UnInvertedField uif = UnInvertedField.getUnInvertedField(f, searcher);
          stv = uif.getStats(searcher, docs, facets).getStatsValues();
        } else {
          stv = getFieldCacheStats(f, facets);
View Full Code Here

    }
    return res;
  }
 
  public NamedList getFieldCacheStats(String fieldName, String[] facet ) {
    FieldType ft = searcher.getSchema().getFieldType(fieldName);

    FieldCache.StringIndex si = null;
    try {
      si = FieldCache.DEFAULT.getStringIndex(searcher.getReader(), fieldName);
    }
View Full Code Here

        FieldComparator comparator = null;
        FieldComparator comparators[] = (readers==null) ? null : new FieldComparator[readers.length];

        String fieldname = sortField.getField();
        FieldType ft = fieldname==null ? null : req.getSchema().getFieldTypeNoEx(fieldname);

        DocList docList = rb.getResults().docList;
        ArrayList<Object> vals = new ArrayList<Object>(docList.size());
        DocIterator it = rb.getResults().docList.iterator();

        int offset = 0;
        int idx = 0;

        while(it.hasNext()) {
          int doc = it.nextDoc();
          if (readers != null) {
            idx = SolrIndexReader.readerIndex(doc, offsets);
            subReader = readers[idx];
            offset = offsets[idx];
            comparator = comparators[idx];
          }

          if (comparator == null) {
            comparator = sortField.getComparator(1,0);
            comparator.setNextReader(subReader, offset);
            if (comparators != null)
              comparators[idx] = comparator;
          }

          doc -= offset;  // adjust for what segment this is in
          comparator.copy(0, doc);
          Object val = comparator.value(0);

          // Sortable float, double, int, long types all just use a string
          // comparator. For these, we need to put the type into a readable
          // format.  One reason for this is that XML can't represent all
          // string values (or even all unicode code points).
          // indexedToReadable() should be a no-op and should
          // thus be harmless anyway (for all current ways anyway)
          if (val instanceof String) {
            field.setValue((String)val);
            val = ft.toObject(field);
          }

          vals.add(val);
        }
View Full Code Here

        }
        String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
        boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
        for (int j = 0; j < fields.length; j++) {
          String field = StringHelper.intern(fields[j]);
          FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
          if (ft==null) ft = new StrField();

          // If no lower bound was specified, use the prefix
          String lower = lowerStr==null ? prefix : (raw ? lowerStr : ft.toInternal(lowerStr));
          if (lower == null) lower="";
          String upper = upperStr==null ? null : (raw ? upperStr : ft.toInternal(upperStr));

          Term lowerTerm = new Term(field, lower);
          Term upperTerm = upper==null ? null : new Term(field, upper);
         
          TermEnum termEnum = rb.req.getSearcher().getReader().terms(lowerTerm); //this will be positioned ready to go
          int i = 0;
          BoundedTreeSet<CountPair<String, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<String, Integer>>(limit) : null);
          NamedList fieldTerms = new NamedList();
          terms.add(field, fieldTerms);
          Term lowerTestTerm = termEnum.term();

          //Only advance the enum if we are excluding the lower bound and the lower Term actually matches
          if (lowerTestTerm!=null && lowerIncl == false && lowerTestTerm.field() == field  // intern'd comparison
                  && lowerTestTerm.text().equals(lower)) {
            termEnum.next();
          }

          while (i<limit || sort) {

            Term theTerm = termEnum.term();

            // check for a different field, or the end of the index.
            if (theTerm==null || field != theTerm.field())  // intern'd comparison
              break;

            String indexedText = theTerm.text();

            // stop if the prefix doesn't match
            if (prefix != null && !indexedText.startsWith(prefix)) break;

            if (upperTerm != null) {
              int upperCmp = theTerm.compareTo(upperTerm);
              // if we are past the upper term, or equal to it (when don't include upper) then stop.
              if (upperCmp>0 || (upperCmp==0 && !upperIncl)) break;
            }

            // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
            int docFreq = termEnum.docFreq();
            if (docFreq >= freqmin && docFreq <= freqmax) {
              // add the term to the list
              String label = raw ? indexedText : ft.indexedToReadable(indexedText);
              if (sort) {
                queue.add(new CountPair<String, Integer>(label, docFreq));
              } else {
                fieldTerms.add(label, docFreq);
                i++;
View Full Code Here

      //there should only be one
      if (queryConverters.size() == 1) {
        queryConverter = queryConverters.values().iterator().next();
        IndexSchema schema = core.getSchema();
        String fieldTypeName = (String) initParams.get("queryAnalyzerFieldType");
        FieldType fieldType = schema.getFieldTypes().get(fieldTypeName);
        Analyzer analyzer = fieldType == null ? new WhitespaceAnalyzer()
                : fieldType.getQueryAnalyzer();
        //TODO: There's got to be a better way!  Where's Spring when you need it?
        queryConverter.setAnalyzer(analyzer);
      }
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.solr.schema.FieldType$DefaultAnalyzer

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.