Package net.sf.regain.util.io

Examples of net.sf.regain.util.io.PathFilenamePair


    super.tearDown();
  }

  public void testExtractPathAndFilename() throws RegainException {

    PathFilenamePair pfPair = RegainToolkit.fragmentUrl("file:///dir1/dir2/nsis_license.txt");

    System.out.println(pfPair.getPath());
    System.out.println(pfPair.getFilename());
    assertEquals("nsis_license.txt", pfPair.getFilename());
    assertEquals("/dir1/dir2/", pfPair.getPath());

  }
View Full Code Here


   * @throws RegainException
   */
  public static String urlToWhitespacedFileName(String url) throws RegainException {

    // Cut file name from path
    PathFilenamePair pfPair = fragmentUrl(url);
    String fileName = pfPair.getFilename();
    if (fileName != null) {
      int lastDot = fileName.lastIndexOf(".");
      // Remove the extension
      String fileNameWithoutExtension = "";
      if (lastDot > 0 && lastDot < fileName.length()) {
View Full Code Here

   *
   * @throws RegainException
   */
  public static PathFilenamePair fragmentUrl(String url) throws RegainException {

    PathFilenamePair pfPair = new PathFilenamePair();
    int lastSlash = url.lastIndexOf("/");
    // Cut file name from path
    if (lastSlash > 0 && lastSlash + 1 < url.length()) {
      String fileName = url.substring(lastSlash + 1);
      String path = url.substring(0, lastSlash + 1);
      path = removeProtocol(path);
      pfPair.setFilename(fileName);
      pfPair.setPath(path);
    } else {
      pfPair.setPath("");
      pfPair.setFilename("");
    }

    return pfPair;
  }
View Full Code Here

    doc.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
   
    // Add the file name (without protocol, drive-letter and path)
    String filenameWithVariants = RegainToolkit.urlToWhitespacedFileName(url);
    doc.add(new Field("filename", new WhitespaceTokenizer(new StringReader(filenameWithVariants))));
    PathFilenamePair pfPair = RegainToolkit.fragmentUrl(url);

    // Add the filename field for sorting
    doc.add(new Field("filename_sort", pfPair.getFilename(), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the document's size
    int size = rawDocument.getLength();
    doc.add(new Field("size", Integer.toString(size), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the mime-type
    String mimeType = rawDocument.getMimeType();
    doc.add(new Field("mimetype", mimeType, Field.Store.YES, Field.Index.NOT_ANALYZED));
   
    // Add last modified
    Date lastModified = rawDocument.getLastModified();
    if (lastModified == null) {
      // We don't know when the document was last modified
      // -> Take the current time
      lastModified = new Date();
    }
    doc.add(new Field("last-modified",
      DateTools.dateToString(lastModified, DateTools.Resolution.DAY), Field.Store.YES,
        Field.Index.NOT_ANALYZED));

    // Write the raw content to an analysis file
    writeContentAnalysisFile(rawDocument);
   
    // Add the additional fields
    if (additionalFieldMap != null) {
      Iterator iter = additionalFieldMap.keySet().iterator();
      while (iter.hasNext()) {
        String fieldName = (String) iter.next();
        String fieldValue = (String) additionalFieldMap.get(fieldName);
        //doc.add(new Field(fieldName, fieldValue, Field.Store.COMPRESS, Field.Index.ANALYZED));
        doc.add(new Field(fieldName, fieldValue, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field(fieldName, CompressionTools.compressString(fieldValue), Field.Store.YES));
      }
    }

    if (hasContent(cleanedContent)) {
      // Write the clean content to an analysis file
      writeAnalysisFile(url, "clean", cleanedContent);

      // Add the cleaned content of the document
      doc.add(new Field("content", cleanedContent,
        this.storeContentForPreview ? Field.Store.YES : Field.Store.NO, Field.Index.ANALYZED));
    } else {
      // We have no content! This is a substitute document
      // -> Add a "preparation-error"-field
      doc.add(new Field("preparation-error", "true", Field.Store.YES,
          Field.Index.NO));
    }

    // Check whether to use the link text as title
    for (int i = 0; i < mUseLinkTextAsTitleReArr.length; i++) {
      if (mUseLinkTextAsTitleReArr[i].match(url)) {
        String linkText = rawDocument.getSourceLinkText();
        if (linkText != null) {
          title = linkText;
        }
        break;
      }
    }

    // Add the document's title
    if (hasContent(title)) {
      doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
      doc.add(new Field("title_sort", title.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    } else {
      doc.add(new Field("title_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    }

    // Add the document's summary
    if (! hasContent(summary) && hasContent(cleanedContent)) {
      summary = createSummaryFromContent(cleanedContent);
    }
    if (hasContent(summary)) {
      doc.add(new Field("summary", summary, Field.Store.NO, Field.Index.ANALYZED));
      doc.add(new Field("summary", CompressionTools.compressString(summary), Field.Store.YES));
    }

   // Add the document's metadata
    if (hasContent(metadata)) {
      doc.add(new Field("metadata", metadata, Field.Store.YES, Field.Index.ANALYZED));
    }

    // Add the document's headlines
    if (hasContent(headlines)) {
      doc.add(new Field("headlines", headlines, Field.Store.NO,
          Field.Index.ANALYZED));
    }

    // Add the document's path
    if (pfPair.getPath() != null) {
      //String asString = pathToString(path);
      doc.add(new Field("path", pfPair.getPath(), Field.Store.YES, Field.Index.NO));
      doc.add(new Field("path_sort", pfPair.getPath().toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));

      // Write the path to an analysis file
      writeAnalysisFile(url, "path", pfPair.getPath());
    } else {
      doc.add(new Field("path_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    }

    return doc;
View Full Code Here

TOP

Related Classes of net.sf.regain.util.io.PathFilenamePair

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.