Package org.apache.lucene.store

Examples of org.apache.lucene.store.ByteArrayDataOutput


    return compress(compressor, decompressed);
  }

  static byte[] compress(Compressor compressor, byte[] decompressed) throws IOException {
    byte[] compressed = new byte[decompressed.length * 2 + 16]; // should be enough
    ByteArrayDataOutput out = new ByteArrayDataOutput(compressed);
    compressor.compress(decompressed, 0, decompressed.length, out);
    final int compressedLen = out.getPosition();
    return Arrays.copyOf(compressed, compressedLen);
  }
View Full Code Here


      // TODO: are we using the best sharing options?
      org.apache.lucene.util.fst.Builder<BytesRef> builder =
        new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
     
      BytesRef scratch = new BytesRef(64);
      ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

      final Set<Integer> dedupSet;

      if (dedup) {
        dedupSet = new HashSet<Integer>();
      } else {
        dedupSet = null;
      }

      final byte[] spare = new byte[5];
     
      Set<CharsRef> keys = workingSet.keySet();
      CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

      final IntsRef scratchIntsRef = new IntsRef();
     
      //System.out.println("fmap.build");
      for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet.get(input);

        int numEntries = output.ords.size();
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
       
        scratch.grow(estimatedSize);
        scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
        assert scratch.offset == 0;

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++) {
          if (dedupSet != null) {
            // box once
            final Integer ent = output.ords.get(i);
            if (dedupSet.contains(ent)) {
              continue;
            }
            dedupSet.add(ent);
          }
          scratchOutput.writeVInt(output.ords.get(i));  
          count++;
        }

        final int pos = scratchOutput.getPosition();
        scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
        final int pos2 = scratchOutput.getPosition();
        final int vIntLen = pos2-pos;

        // Move the count + includeOrig to the front of the byte[]:
        System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
        System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
        System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);

        if (dedupSet != null) {
          dedupSet.clear();
        }
       
        scratch.length = scratchOutput.getPosition() - scratch.offset;
        //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
      }
     
      FST<BytesRef> fst = builder.finish();
View Full Code Here

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    boolean success = false;
    byte buffer[] = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef surfaceForm;

      while ((surfaceForm = iterator.next()) != null) {
        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
       
        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());

        for (IntsRef path : paths) {

          Util.toBytesRef(path, scratch);
         
          // length of the analyzed text (FST input)
          if (scratch.length > Short.MAX_VALUE-2) {
            throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
          }
          short analyzedLength = (short) scratch.length;

          // compute the required length:
          // analyzed sequence + weight (4) + surface + analyzedLength (short)
          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;

          BytesRef payload;

          if (hasPayloads) {
            if (surfaceForm.length > (Short.MAX_VALUE-2)) {
              throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
            }
            payload = payloads.payload();
            // payload + surfaceLength (short)
            requiredLength += payload.length + 2;
          } else {
            payload = null;
          }
         
          buffer = ArrayUtil.grow(buffer, requiredLength);
         
          output.reset(buffer);

          output.writeShort(analyzedLength);

          output.writeBytes(scratch.bytes, scratch.offset, scratch.length);

          output.writeInt(encodeWeight(iterator.weight()));

          if (hasPayloads) {
            for(int i=0;i<surfaceForm.length;i++) {
              if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
                throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
              }
            }
            output.writeShort((short) surfaceForm.length);
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
            output.writeBytes(payload.bytes, payload.offset, payload.length);
          } else {
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
          }

          assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;

          writer.write(buffer, 0, output.getPosition());
        }
      }
      writer.close();

      // Sort all input/output pairs (required by FST.Builder):
View Full Code Here

    boolean crossProduct = args[2].equals("Y");
   
    int numLines = Integer.parseInt(args[3]);
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
   
    for (int i = 0; i < numLines; i++) {
      assert affixWriter.getPosition() == currentAffix << 3;
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");

      // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
      // condition is optional
      if (ruleArgs.length < 4) {
          throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
      }
     
      char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
      String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
      String affixArg = ruleArgs[3];
      char appendFlags[] = null;
     
      int flagSep = affixArg.lastIndexOf('/');
      if (flagSep != -1) {
        String flagPart = affixArg.substring(flagSep + 1);
        affixArg = affixArg.substring(0, flagSep);

        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }
       
        appendFlags = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(appendFlags);
        twoStageAffix = true;
      }
     
      // TODO: add test and fix zero-affix handling!

      String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
      // at least the gascon affix file has this issue
      if (condition.startsWith("[") && !condition.endsWith("]")) {
        condition = condition + "]";
      }
      // "dash hasn't got special meaning" (we must escape it)
      if (condition.indexOf('-') >= 0) {
        condition = condition.replace("-", "\\-");
      }

      final String regex;
      if (".".equals(condition)) {
        regex = ".*"; // Zero condition is indicated by dot
      } else if (condition.equals(strip)) {
        regex = ".*"; // TODO: optimize this better:
                      // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                      // but this is complicated...
      } else {
        regex = String.format(Locale.ROOT, conditionPattern, condition);
      }
     
      // deduplicate patterns
      Integer patternIndex = seenPatterns.get(regex);
      if (patternIndex == null) {
        patternIndex = patterns.size();
        if (patternIndex > Short.MAX_VALUE) {
          throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");         
        }
        seenPatterns.put(regex, patternIndex);
        CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
        patterns.add(pattern);
      }
     
      Integer stripOrd = seenStrips.get(strip);
      if (stripOrd == null) {
        stripOrd = seenStrips.size();
        seenStrips.put(strip, stripOrd);
        if (stripOrd > Character.MAX_VALUE) {
          throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
        }
      }

      if (appendFlags == null) {
        appendFlags = NOFLAGS;
      }
     
      encodeFlags(scratch, appendFlags);
      int appendFlagsOrd = flagLookup.add(scratch);
      if (appendFlagsOrd < 0) {
        // already exists in our hash
        appendFlagsOrd = (-appendFlagsOrd)-1;
      } else if (appendFlagsOrd > Short.MAX_VALUE) {
        // this limit is probably flexible, but its a good sanity check too
        throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
      }
     
      affixWriter.writeShort((short)flag);
      affixWriter.writeShort((short)stripOrd.intValue());
      // encode crossProduct into patternIndex
      int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
      affixWriter.writeShort((short)patternOrd);
      affixWriter.writeShort((short)appendFlagsOrd);
     
      if (needsInputCleaning) {
        CharSequence cleaned = cleanInput(affixArg, sb);
        affixArg = cleaned.toString();
      }
View Full Code Here

    // If negative floats are allowed some trickery needs to be done to find their byte order.
    boolean success = false;
    count = 0;
    try {
      byte [] buffer = new byte [0];
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef spare;
      while ((spare = iterator.next()) != null) {
        if (spare.length + 4 >= buffer.length) {
          buffer = ArrayUtil.grow(buffer, spare.length + 4);
        }

        output.reset(buffer);
        output.writeInt(encodeWeight(iterator.weight()));
        output.writeBytes(spare.bytes, spare.offset, spare.length);
        writer.write(buffer, 0, output.getPosition());
      }
      writer.close();

      // We don't know the distribution of scores and we need to bucket them, so we'll sort
      // and divide into equal buckets.
View Full Code Here

      // TODO: are we using the best sharing options?
      org.apache.lucene.util.fst.Builder<BytesRef> builder =
        new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
     
      BytesRef scratch = new BytesRef(64);
      ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

      final Set<Integer> dedupSet;

      if (dedup) {
        dedupSet = new HashSet<>();
      } else {
        dedupSet = null;
      }

      final byte[] spare = new byte[5];
     
      Set<CharsRef> keys = workingSet.keySet();
      CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

      final IntsRef scratchIntsRef = new IntsRef();
     
      //System.out.println("fmap.build");
      for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet.get(input);

        int numEntries = output.ords.size();
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
       
        scratch.grow(estimatedSize);
        scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
        assert scratch.offset == 0;

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++) {
          if (dedupSet != null) {
            // box once
            final Integer ent = output.ords.get(i);
            if (dedupSet.contains(ent)) {
              continue;
            }
            dedupSet.add(ent);
          }
          scratchOutput.writeVInt(output.ords.get(i));  
          count++;
        }

        final int pos = scratchOutput.getPosition();
        scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
        final int pos2 = scratchOutput.getPosition();
        final int vIntLen = pos2-pos;

        // Move the count + includeOrig to the front of the byte[]:
        System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
        System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
        System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);

        if (dedupSet != null) {
          dedupSet.clear();
        }
       
        scratch.length = scratchOutput.getPosition() - scratch.offset;
        //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
      }
     
      FST<BytesRef> fst = builder.finish();
View Full Code Here

  public static void beforeClass() throws IOException {
    Random random = random();
    INTS = new int[COUNT];
    LONGS = new long[COUNT];
    RANDOM_TEST_BYTES = new byte[COUNT * (5 + 4 + 9 + 8)];
    final ByteArrayDataOutput bdo = new ByteArrayDataOutput(RANDOM_TEST_BYTES);
    for (int i = 0; i < COUNT; i++) {
      final int i1 = INTS[i] = random.nextInt();
      bdo.writeVInt(i1);
      bdo.writeInt(i1);

      final long l1;
      if (rarely()) {
        // a long with lots of zeroes at the end
        l1 = LONGS[i] = TestUtil.nextLong(random, 0, Integer.MAX_VALUE) << 32;
      } else {
        l1 = LONGS[i] = TestUtil.nextLong(random, 0, Long.MAX_VALUE);
      }
      bdo.writeVLong(l1);
      bdo.writeLong(l1);
    }
  }
View Full Code Here

      //System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs);
      // create the header
      // TODO: clean this up: or just rewind+reuse and deal with it
      byte header[] = new byte[MAX_HEADER_SIZE];
      ByteArrayDataOutput bad = new ByteArrayDataOutput(header);
      // write a "false" first arc:
      bad.writeByte(ARCS_AS_FIXED_ARRAY);
      bad.writeVInt(nodeIn.numArcs);
      bad.writeVInt(maxBytesPerArc);
      int headerLen = bad.getPosition();
     
      final long fixedArrayStart = startAddress + headerLen;

      // expand the arcs in place, backwards
      long srcPos = bytes.getPosition();
View Full Code Here

        .setMergePolicy(newLogMergePolicy(false, 10))
        .setOpenMode(IndexWriterConfig.OpenMode.CREATE));

    Document doc = new Document();
    byte bytes[] = new byte[4];
    ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes);
    BytesRef data = new BytesRef(bytes);
    BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data);
    doc.add(dvField);
   
    for (int i = 0; i < Integer.MAX_VALUE; i++) {
      encoder.reset(bytes);
      encoder.writeVInt(i % 65535); // 1, 2, or 3 bytes
      data.length = encoder.getPosition();
      w.addDocument(doc);
      if (i % 100000 == 0) {
        System.out.println("indexed: " + i);
        System.out.flush();
      }
View Full Code Here

    return compress(compressor, decompressed, off, len);
  }

  static byte[] compress(Compressor compressor, byte[] decompressed, int off, int len) throws IOException {
    byte[] compressed = new byte[len * 2 + 16]; // should be enough
    ByteArrayDataOutput out = new ByteArrayDataOutput(compressed);
    compressor.compress(decompressed, off, len, out);
    final int compressedLen = out.getPosition();
    return Arrays.copyOf(compressed, compressedLen);
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.store.ByteArrayDataOutput

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.