Package uk.ac.ucl.panda.utility.structure

Examples of uk.ac.ucl.panda.utility.structure.Document


      throw new IOException(indexDir + " does not exist or is not a directory");
     
    }
   
               
    Document doc =new Document();

   
    TrecDoc docMaker = new TrecDoc(data);
      
View Full Code Here


  }

  // append fields from storedFieldReaders
  public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    ensureOpen();
    Document result = new Document();
    for (int i = 0; i < storedFieldReaders.size(); i++) {
      IndexReader reader = (IndexReader)storedFieldReaders.get(i);

      boolean include = (fieldSelector==null);
      if (!include) {
        Iterator it = ((Collection) readerToFields.get(reader)).iterator();
        while (it.hasNext())
          if (fieldSelector.accept((String)it.next())!=FieldSelectorResult.NO_LOAD) {
            include = true;
            break;
          }
      }
      if (include) {
        Iterator fieldIterator = reader.document(n, fieldSelector).getFields().iterator();
        while (fieldIterator.hasNext()) {
          result.add((Fieldable)fieldIterator.next());
        }
      }
    }
    return result;
  }
View Full Code Here

    resetLeftovers();
    DocData docData = getNextDocData();
    ///////////////////
    if(docData==null)return null;
    ///////////////////
    Document doc = createDocument(docData,0,-1);
    return doc;
  }
View Full Code Here

  // create a doc
  // use only part of the body, modify it to keep the rest (or use all if size==0).
  // reset the docdata properties so they are not added more than once.
  private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
    int docid = incrNumDocsCreated();
    Document doc = new Document();
    doc.add(new Field(ID_FIELD, docid+"", storeVal, indexVal, termVecVal));
    if (docData.getName()!=null) {
      String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
     ///////////////////
      doc.add(new Field(NAME_FIELD, name, Field.Store.YES,Field.Index.UN_TOKENIZED, termVecVal));
    }
    if (docData.getDate()!=null) {
      String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
      doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal, termVecVal));
    }
    if (docData.getTitle()!=null) {
      doc.add(new Field(TITLE_FIELD, docData.getTitle(), storeVal, indexVal, termVecVal));
    }
    if (docData.getBody()!=null && docData.getBody().length()>0) {
      String bdy;
      if (size<=0 || size>=docData.getBody().length()) {
        bdy = docData.getBody(); // use all
        docData.setBody(""); // nothing left
      } else {
        // attempt not to break words - if whitespace found within next 20 chars...
        for (int n=size-1; n<size+20 && n<docData.getBody().length(); n++) {
          if (Character.isWhitespace(docData.getBody().charAt(n))) {
            size = n;
            break;
          }
        }
        bdy = docData.getBody().substring(0,size); // use part
        docData.setBody(docData.getBody().substring(size)); // some left
      }
      doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, Field.TermVector.YES));
      if (storeBytes == true) {
        doc.add(new Field(BYTES_FIELD, bdy.getBytes("UTF-8"), Field.Store.YES));
      }
    }

    if (docData.getProps()!=null) {
      for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext(); ) {
        String key = (String) it.next();
        String val = (String) docData.getProps().get(key);
        doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
      }
      docData.setProps(null);
    }
    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
    return doc;
View Full Code Here

      DocData dd2 = dd;
      dd = getNextDocData();
      cnt = 0;
      dd.setBody(dd2.getBody() + dd.getBody());
    }
    Document doc = createDocument(dd,size,cnt);
    if (dd.getBody()==null || dd.getBody().length()==0) {
      resetLeftovers();
    } else {
      if (lvr == null) {
        lvr = new LeftOver();
View Full Code Here

  final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    indexStream.seek((n + docStoreOffset) * 8L);
    long position = indexStream.readLong();
    fieldsStream.seek(position);

    Document doc = new Document();
    int numFields = fieldsStream.readVInt();
    for (int i = 0; i < numFields; i++) {
      int fieldNumber = fieldsStream.readVInt();
      FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
      FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.getName());
View Full Code Here

TOP

Related Classes of uk.ac.ucl.panda.utility.structure.Document

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.