public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, Integer mincount, boolean missing, String sort, String prefix) throws IOException {
use.incrementAndGet();
FieldType ft = searcher.getSchema().getFieldType(field);
NamedList res = new NamedList(); // order is important
DocSet docs = baseDocs;
int baseSize = docs.size();
int maxDoc = searcher.maxDoc();
if (baseSize >= mincount) {
final int[] index = this.index;
final int[] counts = new int[numTermsInField];
//
// If there is a prefix, find its start and end term numbers
//
int startTerm = 0;
int endTerm = numTermsInField; // one past the end
NumberedTermEnum te = ti.getEnumerator(searcher.getReader());
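// skipTo(prefix) positions the enumerator at the first term >= prefix;
// skipping to prefix + "\uffff..." then yields the first term past all terms
// starting with the prefix (assuming terms sort in plain UTF-16 order),
// giving an exclusive upper bound for the range.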
if (prefix != null && prefix.length() > 0) {
te.skipTo(prefix);
startTerm = te.getTermNumber();
te.skipTo(prefix + "\uffff\uffff\uffff\uffff");
endTerm = te.getTermNumber();
}
/***********
// Alternative 2: get the docSet of the prefix (could take a while) and
// then do the intersection with the baseDocSet first.
if (prefix != null && prefix.length() > 0) {
docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs);
// The issue with this method is that it can return 0 counts for terms without
// the prefix. We can't simply filter those terms out later because doing so
// may mean that we didn't collect enough terms in the queue (in the sorted case).
}
***********/
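// When more than half the documents match, it's cheaper to count the terms
// of the documents that *don't* match and subtract from the per-term totals
// (the maxTermCounts[i] - counts[i] below). This is only valid when every
// term is being counted (no prefix) and the doc set exposes its bitset.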
boolean doNegative = baseSize > (maxDoc >> 1) && termInstances > 0
&& startTerm==0 && endTerm==numTermsInField
&& docs instanceof BitDocSet;
if (doNegative) {
OpenBitSet bs = (OpenBitSet)((BitDocSet)docs).getBits().clone();
bs.flip(0, maxDoc);
// TODO: when an iterator across negative elements is available, use that
// instead of creating a new bitset and inverting.
docs = new BitDocSet(bs, maxDoc - baseSize);
// simply negating will mean that we have deleted docs in the set.
// that should be OK, as their entries in our table should be empty.
}
// For the biggest terms, do straight set intersections
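// (Terms too frequent to be stored in the packed per-document term lists are
// presumably tracked separately in bigTerms, so their counts are computed by
// doc-set intersection instead.)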
for (TopTerm tt : bigTerms.values()) {
// TODO: counts could be deferred if sorted==false
if (tt.termNum >= startTerm && tt.termNum < endTerm) {
counts[tt.termNum] = searcher.numDocs(new TermQuery(tt.term), docs);
}
}
// TODO: we could short-circuit counting altogether for sorted faceting
// where we already have enough terms from the bigTerms
// TODO: we could shrink the size of the collection array, and additionally
// break out once the term number exceeds endTerm, but that would require two
// extra conditionals in the inner loop (although they would be predictable
// for the non-prefix case). Perhaps a separate copy of the code would be
// warranted.
if (termInstances > 0) {
DocIterator iter = docs.iterator();
while (iter.hasNext()) {
int doc = iter.nextDoc();
int code = index[doc];
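// 'code' is either an inline term list or a pointer: if the low byte is 1,
// the remaining 24 bits are a byte offset into one of the tnums arrays
// (selected by bits 16-23 of the docid); otherwise the int itself holds the
// vByte-encoded term-number deltas, lowest byte first.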
if ((code & 0xff)==1) {
int pos = code>>>8;
int whichArray = (doc >>> 16) & 0xff;
byte[] arr = tnums[whichArray];
int tnum = 0;
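// Decode vByte deltas: each byte contributes 7 bits, and a set high bit
// means the value continues in the next byte; e.g. bytes 0x85 0x07 decode
// to (5<<7)|7 = 647. A decoded delta of 0 terminates the list.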
for(;;) {
int delta = 0;
for(;;) {
byte b = arr[pos++];
delta = (delta << 7) | (b & 0x7f);
if ((b & 0x80) == 0) break;
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
counts[tnum]++;
}
} else {
int tnum = 0;
int delta = 0;
for (;;) {
delta = (delta << 7) | (code & 0x7f);
if ((code & 0x80)==0) {
if (delta==0) break;
tnum += delta - TNUM_OFFSET;
counts[tnum]++;
delta = 0;
}
code >>>= 8;
}
}
}
}
int off=offset;
int lim=limit>=0 ? limit : Integer.MAX_VALUE; // a negative limit means "no limit"
if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
maxsize = Math.min(maxsize, numTermsInField);
final BoundedTreeSet<Long> queue = new BoundedTreeSet<Long>(maxsize);
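// BoundedTreeSet keeps only the 'maxsize' smallest elements, dropping the
// largest on overflow; with the pair encoding below, "smallest" means the
// highest counts.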
int min=mincount-1; // the smallest value in the top 'N' values
for (int i=startTerm; i<endTerm; i++) {
int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
if (c>min) {
// NOTE: we use c>min rather than c>=min as an optimization because we are going in
// index order, so we already know that the keys are ordered. This can be very
// important if a lot of the counts are repeated (like zero counts would be).
// minimize object creation and speed comparison by creating a long that
// encompasses both count and term number.
// Since smaller values are kept in the TreeSet, make higher counts smaller.
//
// For equal counts, lower term numbers should come first; they stay in the
// low bits unmodified, so a lower term number compares as smaller and is
// kept. (The commented-out priority-queue variant below needs the opposite
// order, hence its 0x7fffffff-i.)
//long pair = (((long)c)<<32) | (0x7fffffff-i) ; // use if priority queue
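// Example: c=5, i=3 yields 0xFFFFFFFB00000003; a larger count makes the
// high word more negative (sorting first), and ties resolve toward the
// lower term number in the low bits.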
long pair = (((long)-c)<<32) | i;
queue.add(pair); // autoboxing; no need for an explicit new Long(pair)
if (queue.size()>=maxsize) min=-(int)(queue.last().longValue() >>> 32);
}
}
final int[] tnums = new int[Math.min(Math.max(0, queue.size()-off), lim)]; // NOTE: this local shadows the byte[][] tnums field used above
final int[] indirect = counts; // reuse the counts array for the index into the tnums array
assert indirect.length >= tnums.length;
int tnumCount = 0;
for (Long p : queue) {
if (--off>=0) continue;
if (--lim<0) break;
int c = -(int)(p.longValue() >>> 32);
//int tnum = 0x7fffffff - (int)p.longValue(); // use if priority queue
int tnum = (int)p.longValue();
indirect[tnumCount] = tnumCount;
tnums[tnumCount++] = tnum;
// String label = ft.indexedToReadable(getTermText(te, tnum));
// add a null label for now... we'll fill it in later.
res.add(null, c);
}
// now sort the indexes by the term numbers
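// Resolving labels in ascending term-number order lets the term enumerator
// advance forward through the index rather than seek randomly; 'indirect'
// remembers each term's slot in 'res' so the labels land in the right place.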
PrimUtils.sort(0, tnumCount, indirect, new PrimUtils.IntComparator() {
@Override
public int compare(int a, int b) {
return tnums[a] - tnums[b];
}
});
// convert the term numbers to term values and set as the label
for (int i=0; i<tnumCount; i++) {
int idx = indirect[i];
int tnum = tnums[idx];
String label = ft.indexedToReadable(getTermText(te, tnum));
res.setName(idx, label);
}
} else {
// add results in index order
int i=startTerm;
if (mincount<=0) {
// if mincount<=0, then we won't discard any terms and we know exactly
// where to start.
i=startTerm+off;
off=0;
}
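// e.g. with mincount<=0 and an offset of 5, iteration simply starts at
// startTerm+5 rather than skipping the first five qualifying terms.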
for (; i<endTerm; i++) {
int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
if (c<mincount || --off>=0) continue;
if (--lim<0) break;
String label = ft.indexedToReadable(getTermText(te, i));
res.add(label, c);
}
}
te.close(); // TODO: close in a finally block so the enumerator isn't leaked if an exception is thrown above