Examples of DocumentMetaData


Examples of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData
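DKPro Core attaches exactly one DocumentMetaData annotation to each CAS view, and the snippets below read it back to recover the document ID. For orientation, here is a minimal, self-contained sketch of creating and reading that annotation; it assumes the standard dkpro-core API and uimaFIT's JCasFactory, and is not taken from the projects quoted below.

    import org.apache.uima.fit.factory.JCasFactory;
    import org.apache.uima.jcas.JCas;

    import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;

    public class DocumentMetaDataSketch
    {
        public static void main(String[] args) throws Exception
        {
            JCas jcas = JCasFactory.createJCas();
            jcas.setDocumentText("Example text.");

            // create() attaches a fresh DocumentMetaData annotation to the CAS
            // (it fails if one is already present)
            DocumentMetaData meta = DocumentMetaData.create(jcas);
            meta.setDocumentId("example-1");

            // get() retrieves the single DocumentMetaData annotation
            // (it fails if the CAS has none)
            System.out.println(DocumentMetaData.get(jcas).getDocumentId());
        }
    }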

    }
    catch (CASException e) {
      throw new AnalysisEngineProcessException(e);
    }
   
    DocumentMetaData md1 = DocumentMetaData.get(view1);
    DocumentMetaData md2 = DocumentMetaData.get(view2);   

    try {
      writer.write(md1.getDocumentId() + "\t" + md2.getDocumentId() + LF);
    }
    catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
  }
View Full Code Here
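The fragment above opens mid-method, after the two views have already been fetched. Judging from the CombinationReader-based snippet further down this page, they are presumably obtained along these lines (a sketch reusing the CASException handling shown above; CombinationReader.VIEW_2 is assumed by analogy with VIEW_1):

    JCas view1;
    JCas view2;
    try {
      // VIEW_1 and VIEW_2 are the paired-text views created by CombinationReader
      view1 = jcas.getView(CombinationReader.VIEW_1);
      view2 = jcas.getView(CombinationReader.VIEW_2);
    }
    catch (CASException e) {
      throw new AnalysisEngineProcessException(e);
    }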

Examples of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData

    }
    catch (CASException e) {
      throw new AnalysisEngineProcessException(e);
    }
   
    DocumentMetaData md1 = JCasUtil.selectSingle(view1, DocumentMetaData.class);
    DocumentMetaData md2 = JCasUtil.selectSingle(view2, DocumentMetaData.class);
   
    TextSimilarityScore score = JCasUtil.selectSingle(jcas, ExperimentalTextSimilarityScore.class);
   
    TextSimilarityScore goldScore = null;
    if (outputGoldScores) {
      goldScore = JCasUtil.selectSingle(jcas, GoldTextSimilarityScore.class);
    }
   
    try {
      if (outputScoresOnly) {
        if (outputGoldScores) {
          writer.write(score.getScore() + "\t" + goldScore.getScore() + LF);
        } else {
          writer.write(score.getScore() + LF);
        }
      } else {
        if (outputGoldScores) {
          writer.write(md1.getDocumentId() + "\t" +
              md2.getDocumentId() + "\t" +
              score.getScore() + "\t" +
              goldScore.getScore() + LF);
        } else {
          writer.write(md1.getDocumentId() + "\t" +
              md2.getDocumentId() + "\t" +
              score.getScore() + LF);
        }
      }
    }
    catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
View Full Code Here
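Note that this snippet fetches the metadata via JCasUtil.selectSingle rather than DocumentMetaData.get, as the first snippet does. With uimaFIT the two are effectively interchangeable here, since both insist on exactly one DocumentMetaData per view. A minimal sketch of the equivalence (hypothetical helper; current uimaFIT package names assumed):

    import org.apache.uima.fit.util.JCasUtil;
    import org.apache.uima.jcas.JCas;

    import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;

    class MetadataLookup
    {
        // Both calls throw if the view does not hold exactly one DocumentMetaData
        static String documentIdOf(JCas view)
        {
            DocumentMetaData viaGet = DocumentMetaData.get(view);
            // selectSingle resolves to the same feature structure as get()
            DocumentMetaData viaSelect = JCasUtil.selectSingle(view, DocumentMetaData.class);
            return viaGet.getDocumentId();
        }
    }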

Examples of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData

  {
      try
      {
        JCas view1 = jcas.getView(CombinationReader.VIEW_1);

        DocumentMetaData md = DocumentMetaData.get(view1);

        // The combined document ID has the form "<prefix>-<id>"; keep the part after the first '-'
        String docID = md.getDocumentId().substring(md.getDocumentId().indexOf("-") + 1);

        EntailmentClassificationOutcome outcome = JCasUtil.selectSingle(jcas, EntailmentClassificationOutcome.class);
//        System.out.println(docID + "::" + outcome.getOutcome());
        gold.put(docID, outcome.getOutcome());
      }
View Full Code Here

Examples of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData

    }
   
   
    for (JCas view : views)
    {
      DocumentMetaData md = DocumentMetaData.get(view);
     
      File outputFile = new File(outputDir, md.getDocumentId() + ".txt");
     
      BufferedWriter writer;
      try {
        writer = new BufferedWriter(new FileWriter(outputFile));
       
View Full Code Here
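The fragment is cut off before the writer is used and closed. In new code, try-with-resources keeps the file handle safe even when a write fails; a hedged sketch (the helper name and parameters are illustrative, not from the original source):

    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;

    class ViewWriter
    {
        // Hypothetical helper: writes one view's text to <outputDir>/<docId>.txt
        static void writeView(File outputDir, String docId, String text) throws IOException
        {
            File outputFile = new File(outputDir, docId + ".txt");
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile)))
            {
                writer.write(text);
            } // closed automatically here, even if write() throws
        }
    }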

Examples of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData

            Annotation coveringAnnotation2)
    throws SimilarityException
  {
    // The feature generation needs to have happened before!
   
    DocumentMetaData md = DocumentMetaData.get(jcas1);
    // IDs look like "<prefix>-<number>"; parse the numeric suffix
    int id = Integer.parseInt(md.getDocumentId().substring(md.getDocumentId().indexOf("-") + 1));
   
    System.out.println(id);
   
    // Document IDs are 1-based, while the test instance list is 0-based
    Instance testInst = test.get(id - 1);
   
View Full Code Here

Examples of de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData

            Annotation coveringAnnotation2)
    throws SimilarityException
  {
    // The feature generation needs to have happened before!
   
    DocumentMetaData md = DocumentMetaData.get(jcas1);
    // Here the document ID itself is expected to be fully numeric
    int id = Integer.parseInt(md.getDocumentId());
   
    System.out.println(id);
   
    Instance testInst = test.get(id - 1);
   
View Full Code Here

Examples of lucandra.serializers.thrift.DocumentMetadata

        byte[] indexNameBytes = indexName.getBytes("UTF-8");
        ByteBuffer indexTermsKey = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes,
                "terms".getBytes("UTF-8"));

        DocumentMetadata allIndexedTerms = new DocumentMetadata();
        Map<String, DocumentMetadata> fieldCache = new HashMap<String, DocumentMetadata>(1024);

        // By default we don't handle indexSharding
        // We round robin replace the index
        docNumber = docNumber % CassandraIndexManager.maxDocsPerShard;

        ByteBuffer docId = ByteBuffer.wrap(CassandraUtils.writeVInt(docNumber));
        int position = 0;

        for (Fieldable field : doc.getFields())
        {

            ThriftTerm firstTerm = null;

            // Indexed field
            if (field.isIndexed() && field.isTokenized())
            {
                TokenStream tokens = field.tokenStreamValue();

                if (tokens == null)
                {
                    Reader tokReader = field.readerValue();

                    if (tokReader == null)
                        tokReader = new StringReader(field.stringValue());

                    tokens = analyzer.reusableTokenStream(field.name(), tokReader);
                }

                // collect term information per field
                Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new HashMap<Term, Map<ByteBuffer, List<Number>>>();

                int lastOffset = 0;
                if (position > 0)
                {
                    position += analyzer.getPositionIncrementGap(field.name());
                }

                // Build the termPositions vector for all terms

                tokens.reset(); // reset the TokenStream to the first token

                // set up token attributes we are working on

                // offsets
                OffsetAttribute offsetAttribute = null;
                if (field.isStoreOffsetWithTermVector())
                    offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

                // positions
                // these are always gathered in later lucene versions
                PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) tokens
                            .addAttribute(PositionIncrementAttribute.class);

                // term as string
                CharTermAttribute termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class);

                // store normalizations of field per term per document rather
                // than per field.
                // this adds more to write but less to read on other side
                int tokensInField = 0;

                while (tokens.incrementToken())
                {
                    tokensInField++;
                    Term term = new Term(field.name(), termAttribute.toString());

                    ThriftTerm tterm = new ThriftTerm(term.field()).setText(
                            ByteBuffer.wrap(term.text().getBytes("UTF-8"))).setIs_binary(false);

                    if (firstTerm == null)
                        firstTerm = tterm;

                    allIndexedTerms.addToTerms(tterm);

                    // fetch all collected information for this term
                    Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);

                    if (termInfo == null)
                    {
                        termInfo = new HashMap<ByteBuffer, List<Number>>();
                        allTermInformation.put(term, termInfo);
                    }

                    // term frequency
                    {
                        List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKeyBytes);

                        if (termFrequency == null)
                        {
                            termFrequency = new ArrayList<Number>();
                            termFrequency.add(new Integer(0));
                            termInfo.put(CassandraUtils.termFrequencyKeyBytes, termFrequency);
                        }

                        // increment
                        termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                    }

                    // position vector
                    {
                        // Increments > 1 encode gaps (e.g. removed stopwords); add the extra gap
                        position += (posIncrAttribute.getPositionIncrement() - 1);

                        List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKeyBytes);

                        if (positionVector == null)
                        {
                            positionVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.positionVectorKeyBytes, positionVector);
                        }

                        positionVector.add(++position);
                    }

                    // term offsets
                    if (field.isStoreOffsetWithTermVector())
                    {

                        List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKeyBytes);
                        if (offsetVector == null)
                        {
                            offsetVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.offsetVectorKeyBytes, offsetVector);
                        }

                        offsetVector.add(lastOffset + offsetAttribute.startOffset());
                        offsetVector.add(lastOffset + offsetAttribute.endOffset());

                    }
                }

                List<Number> bnorm = null;
                if (!field.getOmitNorms())
                {
                    bnorm = new ArrayList<Number>();

                    final FieldInvertState invertState = new FieldInvertState();
                    invertState.setBoost(doc.getBoost() * field.getBoost());
                    invertState.setLength(tokensInField);
                    final float norm = similarity.computeNorm(field.name(), invertState);

                    bnorm.add(Similarity.getDefault().encodeNormValue(norm));
                }

                for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet())
                {

                    // Terms are stored within a unique key combination
                    // This is required since cassandra loads all columns
                    // in a key/column family into memory
                    ByteBuffer key = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes, term
                            .getKey().field().getBytes("UTF-8"), CassandraUtils.delimeterBytes, term.getKey().text()
                            .getBytes("UTF-8"));

                    // Mix in the norm for this field alongside each term
                    // more writes but faster on read side.
                    if (!field.getOmitNorms())
                    {
                        term.getValue().put(CassandraUtils.normsKeyBytes, bnorm);
                    }

                    CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                            new LucandraTermInfo(docNumber, term.getValue()).serialize());

                    // Store all terms under a row
                    CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                            CassandraUtils.createColumnName(term.getKey()), indexTermsKey,
                            ByteBufferUtil.EMPTY_BYTE_BUFFER);
                }
            }

            // Untokenized fields go in without a termPosition
            if (field.isIndexed() && !field.isTokenized())
            {
                ThriftTerm tterm = new ThriftTerm(field.name()).setText(
                        ByteBuffer.wrap(field.stringValue().getBytes("UTF-8"))).setIs_binary(false);

                if (firstTerm == null)
                    firstTerm = tterm;

                allIndexedTerms.addToTerms(tterm);

                ByteBuffer key = CassandraUtils.hashKeyBytes(indexName.getBytes("UTF-8"),
                        CassandraUtils.delimeterBytes, field.name().getBytes("UTF-8"), CassandraUtils.delimeterBytes,
                        field.stringValue().getBytes("UTF-8"));

                Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                termMap.put(CassandraUtils.termFrequencyKeyBytes, CassandraUtils.emptyArray);
                termMap.put(CassandraUtils.positionVectorKeyBytes, CassandraUtils.emptyArray);

                CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                        new LucandraTermInfo(docNumber, termMap).serialize());

                // Store all terms under a row
                CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                        CassandraUtils.createColumnName(field), indexTermsKey, ByteBufferUtil.EMPTY_BYTE_BUFFER);
            }

            // Stores each field as a column under this doc key
            if (field.isStored())
            {
                ThriftTerm tt = new ThriftTerm(field.name());

                if (field instanceof NumericField)
                {
                    Number n = ((NumericField) field).getNumericValue();
                    switch(((NumericField) field).getDataType())
                    {
                    case LONG: tt.setLongVal(n.longValue()); break;
                    case INT: tt.setIntVal(n.intValue()); break;
                    case FLOAT: tt.setFloatVal(n.floatValue()); break;
                    case DOUBLE: tt.setDoubleVal(n.doubleValue()); break;
                    default: throw new IllegalStateException("Unknown numeric type in field: " + field);
                    }
                }

                byte[] value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");
                tt.setText(ByteBuffer.wrap(value)).setIs_binary(field.isBinary());

                // logic to handle multiple fields w/ same name
                DocumentMetadata currentValue = fieldCache.get(field.name());
                if (currentValue == null)
                {
                    currentValue = new DocumentMetadata();
                    fieldCache.put(field.name(), currentValue);
                }

                currentValue.addToTerms(tt);
            }

            // Store for field cache
            if (firstTerm != null)
            {
View Full Code Here

Examples of lucandra.serializers.thrift.DocumentMetadata

        IColumn metaCol = rows.get(0).cf.getColumn(CassandraUtils.documentMetaFieldBytes);
        if (metaCol == null)
            return;

        DocumentMetadata terms = fromBytesUsingThrift(metaCol.value());

        Set<String> fields = new HashSet<String>();

        for (ThriftTerm term : terms.getTerms())
        {
            // remove from field cache
            if (!fields.contains(term.getField()))
            {
                ByteBuffer fieldCacheKey = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes,
View Full Code Here

Examples of lucandra.serializers.thrift.DocumentMetadata

    }

    /** Read the object from bytes string. */
    public static DocumentMetadata fromBytesUsingThrift(ByteBuffer data) throws IOException
    {
        DocumentMetadata docMeta = new DocumentMetadata();

        byte[] decompressedData = CassandraUtils.decompress(ByteBufferUtil.getArray(data));

        TTransport trans = new TMemoryInputTransport(decompressedData);
        TProtocol deser = protocolFactory.getProtocol(trans);

        try
        {
            docMeta.read(deser);
        }
        catch (TException e)
        {
            throw new IOException(e);
        }
View Full Code Here
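Only the read side is shown. The matching write side presumably mirrors it with the same protocolFactory; a sketch under that assumption (the method name toBytesUsingThrift and the CassandraUtils.compress call are assumptions, chosen to mirror the decompress() above):

    /** Hypothetical write-side counterpart to fromBytesUsingThrift() above. */
    public static byte[] toBytesUsingThrift(DocumentMetadata docMeta) throws IOException
    {
        TMemoryBuffer trans = new TMemoryBuffer(512);
        TProtocol ser = protocolFactory.getProtocol(trans);

        try
        {
            docMeta.write(ser);
        }
        catch (TException e)
        {
            throw new IOException(e);
        }

        byte[] raw = java.util.Arrays.copyOf(trans.getArray(), trans.length());
        return CassandraUtils.compress(raw); // assumed inverse of decompress()
    }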

Examples of lucandra.serializers.thrift.DocumentMetadata

                        {
                            logger.warn("Filtering out __META__ key");
                            continue;
                        }

                        DocumentMetadata dm = lucandra.IndexWriter.fromBytesUsingThrift(col.value());

                        for (ThriftTerm term : dm.getTerms())
                        {
                            Fieldable f = null;
                           
                            if( term.isSetLongVal() )
                            {
View Full Code Here
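The loop is cut off just as it starts rebuilding Lucene Fieldables from the stored ThriftTerms. Judging from the write side earlier on this page (setLongVal, setIntVal, setFloatVal, setDoubleVal), the numeric branches presumably look roughly like this (a sketch against the Lucene 3.x NumericField API, not the actual lucandra code):

    // Hypothetical continuation: rebuild a numeric field from the thrift term
    if (term.isSetLongVal())
    {
        f = new NumericField(term.getField()).setLongValue(term.getLongVal());
    }
    else if (term.isSetIntVal())
    {
        f = new NumericField(term.getField()).setIntValue(term.getIntVal());
    }
    else if (term.isSetDoubleVal())
    {
        f = new NumericField(term.getField()).setDoubleValue(term.getDoubleVal());
    }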