Examples of org.apache.lucene.document.Document

org.apache.lucene.document.Document
Documents are the unit of indexing and search. A Document is a set of fields. Each field has a name and a textual value. A field may be {@link Fieldable#isStored() stored} with the document, in whichcase it is returned with search hits on the document. Thus each document should typically contain one or more stored fields which uniquely identify it.
Note that fields which are not {@link Fieldable#isStored() stored} arenot available in documents retrieved from the index, e.g. with {@link ScoreDoc#doc}, {@link Searcher#doc(int)} or {@link IndexReader#document(int)}.

   */
  @Override
  public void printEndTag(PageRequest request, PageResponse response)
    throws RegainException
  {
    Document hit = (Document) request.getContextAttribute(ATTR_CURRENT_HIT);
    if (hit == null) {
      throw new RegainException("Tag " + getTagName()
          + " must be inside a list tag!");
    }

View Full Code Here

      boolean removeOldEntry = false;


      // Search the entry for this URL
      Term urlTerm = new Term("url", rawDocument.getUrl());
      Query query = new TermQuery(urlTerm);
      Document doc;
      try {
        setIndexMode(SEARCHING_MODE);
        TopScoreDocCollector collector = TopScoreDocCollector.create(20, false);
        mIndexSearcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;


        if (hits.length > 0) {
          if (hits.length > 1) {
            for (int i = 1; i < hits.length; i++) {
              markForDeletion(mIndexSearcher.doc(hits[i].doc));
            }
            mLog.warn("There are duplicate entries (" + hits.length + " in " +
                    "total) for " + rawDocument.getUrl() + ". They will be removed.");
          }


          doc = mIndexSearcher.doc(hits[0].doc);
        } else {
          doc = null;
        }
      } catch (IOException exc) {
        throw new RegainException("Searching old index entry failed for " + rawDocument.getUrl(), exc);
      }


      // If we found an entry, check whether it is up-to-date
      if (doc != null) {
        // Get the last modification date from the document
        Date docLastModified = rawDocument.getLastModified();


        if (docLastModified == null) {
          // We are not able to get the last modification date from the
          // document (this happens with all http-URLs)
          // -> Delete the old entry and create a new one
          mLog.info("Don't know when the document was last modified. " +
                  "Creating a new index entry...");
          removeOldEntry = true;


        } else {
          // Compare the modification date with the one from the index entry
          String asString = doc.get("last-modified");
          if (asString != null) {
            long diff = 86400001L;
            Date indexLastModified = null;
            try {
              indexLastModified = DateTools.stringToDate(asString);
              diff = docLastModified.getTime() - indexLastModified.getTime();
            } catch (ParseException parseException) {
              mLog.warn("Couldn't parse last-modified date from index. Document: " +
                      rawDocument.getUrl(), parseException);
            }
            if (diff > 86400000L) {
              // -> The index entry is not up-to-date -> Delete the old entry
              mLog.info("Index entry is outdated. Creating a new one (source=" +
                      docLastModified + "), (index=" + indexLastModified + "): " +
                      rawDocument.getUrl());
              removeOldEntry = true;


            } else if ((new Date().getTime()) - indexLastModified.getTime() < 86400000L) {
              // Spidering at the same day
              // Due to the fuzziness of the docLastModified.getTime() (day accuracy) 
              // we can't be sure whether the document is up-to-date or not
              mLog.info("Index entry is from the same day. Therefore we have to recrawl but do not index the document." +
                      "Creating a new one (source=" + docLastModified + "), (index=" + indexLastModified + "): " +
                      rawDocument.getUrl());


              parseDocument(rawDocument, errorLogger);


              return;


            } else {
              // The index entry is up-to-date


              // Check whether the preparation failed the last time
              boolean failedLastTime = doc.get("preparation-error") != null;
              if (failedLastTime) {
                if (mRetryFailedDocs) {
                  // The entry failed the last time, the user want's a retry
                  // -> We do a retry
                  mLog.info("Retrying preparation of: " + rawDocument.getUrl());

View Full Code Here

          throws RegainException {
    // Dokument erzeugen
    if (mLog.isDebugEnabled()) {
      mLog.debug("Creating document: " + rawDocument.getUrl());
    }
    Document doc = mDocumentFactory.createDocument(rawDocument, errorLogger);


    // Dokument in den Index aufnehmen
    if (doc != null) {
      mAddToIndexProfiler.startMeasuring();
      try {

View Full Code Here

    setIndexMode(READING_MODE);
    int docCount = mIndexReader.numDocs();
    for (int docIdx = 0; docIdx < docCount; docIdx++) {
      if (!mIndexReader.isDeleted(docIdx)) {
        // Document lesen
        Document doc;
        try {
          doc = mIndexReader.document(docIdx);
        } catch (Throwable thr) {
          throw new RegainException("Getting document #" + docIdx + " from index failed.", thr);
        }


        // URL und last-modified holen
        String url = doc.get("url");
        String lastModified = doc.get("last-modified");


        // Prüfen, ob die URL gelöscht werden soll
        boolean shouldBeDeleted;
        if (url != null) {
          // Prüfen, ob dieser Eintrag zum Löschen vorgesehen ist

View Full Code Here

    }


    rawDocument.setMimeType( mimeType );
    
    // Find the preparator that will prepare this URL
    Document doc = null;
    boolean preparatorFound = false;
    ArrayList <Integer>matchingPreperators = new ArrayList <Integer>();
    for (int i = 0; i < mPreparatorArr.length; i++) {
      if (mPreparatorArr[i].accepts(rawDocument)) {
        // This preparator can prepare this URL

View Full Code Here

      throw new RegainException("Preparator " + preparator.getClass().getName()
        + " did not extract the content of " + url);
    }


    // Preparing succeed -> Create the document
    Document doc = createDocument(rawDocument, cleanedContent, title,
                                  summary, metadata, headlines, path, additionalFieldMap);


    // return the document
    return doc;
  }

View Full Code Here

    throws RegainException
  {
    String url = rawDocument.getUrl();


    // Create a new, empty document
    Document doc = new Document();
    
    // Create the auxiliary fields
    // NOTE: We do this at first, because if someone defined an auxiliary field
    //       having the same name as a normal field, then the field will be
    //       overriden by the normal field. This way we can be sure that the
    //       normal fields have the value we expect.
    AuxiliaryField[] auxiliaryFieldArr = mConfig.getAuxiliaryFieldList();
    if (auxiliaryFieldArr != null) {
      for (int i = 0; i < auxiliaryFieldArr.length; i++) {
        RE regex = auxiliaryFieldArr[i].getUrlRegex();
        if (regex.match(url)) {
          String fieldName = auxiliaryFieldArr[i].getFieldName();


          String value = auxiliaryFieldArr[i].getValue();
          if (value == null) {
            // We have no value set -> Extract the value from the regex
            value = regex.getParen(auxiliaryFieldArr[i].getUrlRegexGroup());
          }


          if (value != null) {
            if (auxiliaryFieldArr[i].getToLowerCase()) {
              value = value.toLowerCase();
            }


            if (mLog.isDebugEnabled()) {
              mLog.debug("Adding auxiliary field: " + fieldName + "=" + value);
            }
            boolean store = auxiliaryFieldArr[i].isStored();
            boolean index = auxiliaryFieldArr[i].isIndexed();
            boolean token = auxiliaryFieldArr[i].isTokenized();


            doc.add(new Field(fieldName, value,
                store ? Field.Store.YES : Field.Store.NO,
                index ? (token ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO));
          }
        }
      }
    }
    
    // Add the groups of the document
    if (mCrawlerAccessController != null) {
      String[] groupArr = mCrawlerAccessController.getDocumentGroups(rawDocument);
      
      // Check the Group array
      RegainToolkit.checkGroupArray(mCrawlerAccessController, groupArr);


      // Add the field
      // NOTE: The field "groups" is tokenized, but not stemmed.
      //       See: RegainToolkit.WrapperAnalyzer
      Iterator groupIter = Arrays.asList(groupArr).iterator();
      StringBuilder tokenBuilder = new StringBuilder();
      while (groupIter.hasNext()) {
        tokenBuilder.append((String) groupIter.next());
        tokenBuilder.append(" ");
      }
    
      //doc.add(new Field("groups", new IteratorTokenStream(groupIter)));
      doc.add(new Field("groups", new WhitespaceTokenizer(new StringReader(tokenBuilder.toString()))));
    }


    // Add the URL of the document
    doc.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
    
    // Add the file name (without protocol, drive-letter and path)
    String filenameWithVariants = RegainToolkit.urlToWhitespacedFileName(url);
    doc.add(new Field("filename", new WhitespaceTokenizer(new StringReader(filenameWithVariants))));
    PathFilenamePair pfPair = RegainToolkit.fragmentUrl(url);


    // Add the filename field for sorting
    doc.add(new Field("filename_sort", pfPair.getFilename(), Field.Store.YES, Field.Index.NOT_ANALYZED));


    // Add the document's size
    int size = rawDocument.getLength();
    doc.add(new Field("size", Integer.toString(size), Field.Store.YES, Field.Index.NOT_ANALYZED));


    // Add the mime-type
    String mimeType = rawDocument.getMimeType();
    doc.add(new Field("mimetype", mimeType, Field.Store.YES, Field.Index.NOT_ANALYZED));
    
    // Add last modified
    Date lastModified = rawDocument.getLastModified();
    if (lastModified == null) {
      // We don't know when the document was last modified
      // -> Take the current time
      lastModified = new Date();
    }
    doc.add(new Field("last-modified", 
      DateTools.dateToString(lastModified, DateTools.Resolution.DAY), Field.Store.YES,
        Field.Index.NOT_ANALYZED));


    // Write the raw content to an analysis file
    writeContentAnalysisFile(rawDocument);
    
    // Add the additional fields
    if (additionalFieldMap != null) {
      Iterator iter = additionalFieldMap.keySet().iterator();
      while (iter.hasNext()) {
        String fieldName = (String) iter.next();
        String fieldValue = (String) additionalFieldMap.get(fieldName);
        //doc.add(new Field(fieldName, fieldValue, Field.Store.COMPRESS, Field.Index.ANALYZED));
        doc.add(new Field(fieldName, fieldValue, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field(fieldName, CompressionTools.compressString(fieldValue), Field.Store.YES));
      }
    }


    if (hasContent(cleanedContent)) {
      // Write the clean content to an analysis file
      writeAnalysisFile(url, "clean", cleanedContent);


      // Add the cleaned content of the document
      doc.add(new Field("content", cleanedContent, 
        this.storeContentForPreview ? Field.Store.YES : Field.Store.NO, Field.Index.ANALYZED));
    } else {
      // We have no content! This is a substitute document
      // -> Add a "preparation-error"-field
      doc.add(new Field("preparation-error", "true", Field.Store.YES,
          Field.Index.NO));
    }


    // Check whether to use the link text as title
    for (int i = 0; i < mUseLinkTextAsTitleReArr.length; i++) {
      if (mUseLinkTextAsTitleReArr[i].match(url)) {
        String linkText = rawDocument.getSourceLinkText();
        if (linkText != null) {
          title = linkText;
        }
        break;
      }
    }


    // Add the document's title
    if (hasContent(title)) {
      doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
      doc.add(new Field("title_sort", title.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    } else {
      doc.add(new Field("title_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    }


    // Add the document's summary
    if (! hasContent(summary) && hasContent(cleanedContent)) {
      summary = createSummaryFromContent(cleanedContent);
    }
    if (hasContent(summary)) {
      doc.add(new Field("summary", summary, Field.Store.NO, Field.Index.ANALYZED));
      doc.add(new Field("summary", CompressionTools.compressString(summary), Field.Store.YES));
    }


   // Add the document's metadata
    if (hasContent(metadata)) {
      doc.add(new Field("metadata", metadata, Field.Store.YES, Field.Index.ANALYZED));
    }


    // Add the document's headlines
    if (hasContent(headlines)) {
      doc.add(new Field("headlines", headlines, Field.Store.NO,
          Field.Index.ANALYZED));
    }


    // Add the document's path
    if (pfPair.getPath() != null) {
      //String asString = pathToString(path);
      doc.add(new Field("path", pfPair.getPath(), Field.Store.YES, Field.Index.NO));
      doc.add(new Field("path_sort", pfPair.getPath().toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));


      // Write the path to an analysis file
      writeAnalysisFile(url, "path", pfPair.getPath());
    } else {
      doc.add(new Field("path_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    }


    return doc;
  }

View Full Code Here

  
  public Document[] createDataTwo(){
    ArrayList<Document> dataList=new ArrayList<Document>();
      String color = "red";
      String ID = Integer.toString(10);
      Document d=new Document();
      d.add(new Field("id",ID,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
      d.add(new Field("color",color,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
      d.add(new NumericField("NUM").setIntValue(10));
      dataList.add(d);
      
       color = "green";
       ID = Integer.toString(11);
       d=new Document();
      d.add(new Field("id",ID,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
      d.add(new Field("color",color,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
      d.add(new NumericField("NUM").setIntValue(11));
      dataList.add(d);
      
    
    return dataList.toArray(new Document[dataList.size()]);
}

View Full Code Here

      ArrayList<Document> dataList=new ArrayList<Document>();
      for(int i=0; i<_documentSize; i++)
      {
        String color = (i%2 == 0) ? "red" : "green";
        String ID = Integer.toString(i);
        Document d=new Document();
        d.add(new Field("id",ID,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
        d.add(new Field("color",color,Field.Store.YES,Index.NOT_ANALYZED_NO_NORMS));
        dataList.add(d);
      }
      
      return dataList.toArray(new Document[dataList.size()]);
  }

View Full Code Here

    throws RegainException
  {
    boolean shouldHighlight = results.getShouldHighlight(hitIndex);
    
    try {
      Document hit = results.getHitDocument(hitIndex);
      if (shouldHighlight) {
        results.highlightHitDocument(hitIndex);
      } else {
        results.shortenSummary(hitIndex);
      }
      request.setContextAttribute(ATTR_CURRENT_HIT, hit);
      float score = results.getHitScore(hitIndex);
      request.setContextAttribute(ATTR_CURRENT_HIT_SCORE, new Float(score));
      request.setContextAttribute(ATTR_CURRENT_HIT_INDEX, new Integer(hitIndex));


      String order = request.getParameter("order");
      //System.out.println("order: " + order);
      if (!(order == null || order.length() == 0 || order.startsWith(SortingOption.RELEVANCE))) {
        String fieldName = order.substring(0, order.lastIndexOf("_"));
        //System.out.println("none standard order. fieldname: " + fieldName);
        Field field = hit.getField(fieldName);
        String fieldContent = null;
        if (field != null) {
          fieldContent = field.stringValue();
        }
        if (fieldContent == null) {

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.document.Document

com.bericotech.clavin.gazetteer.query.LuceneGazetteer

com.browseengine.bobo.test.BasicIndexingTest

com.browseengine.bobo.test.BoboTestCase

com.browseengine.bobo.test.FacetNotValuesTest

com.gentics.cr.lucene.autocomplete.Autocompleter

com.gentics.cr.lucene.indexer.index.CRLuceneIndexJob

com.gentics.cr.lucene.indexer.index.LuceneIndexUpdateChecker

com.gentics.cr.lucene.search.CRSearcher

com.gentics.cr.lucene.search.query.mocks.SimpleLucene

com.tamingtext.classifier.mlt.TrainMoreLikeThis

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.