Package org.apache.nutch.protocol

Source Code of org.apache.nutch.protocol.Content

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.protocol;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayFile;
import org.apache.hadoop.io.CompressedWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypeException;
import org.apache.nutch.util.mime.MimeTypes;

public final class Content extends CompressedWritable {

  public static final String DIR_NAME = "content";

  private final static byte VERSION = 2;

  private byte version;

  private String url;

  private String base;

  private byte[] content;

  private String contentType;

  private Metadata metadata;

  private boolean mimeTypeMagic;

  private MimeTypes mimeTypes;

  private boolean inflated;

  public Content() {
    inflated = false;
  }

  public Content(String url, String base, byte[] content, String contentType,
      Metadata metadata, Configuration conf) {

    if (url == null)
      throw new IllegalArgumentException("null url");
    if (base == null)
      throw new IllegalArgumentException("null base");
    if (content == null)
      throw new IllegalArgumentException("null content");
    if (metadata == null)
      throw new IllegalArgumentException("null metadata");

    this.url = url;
    this.base = base;
    this.content = content;
    this.metadata = metadata;
    this.mimeTypeMagic = conf.getBoolean("mime.type.magic", true);
    this.mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
    this.contentType = getContentType(contentType, url, content);
    inflated = true;
  }

  public void ensureInflated() {
    if (inflated) {
      return;
    }
    super.ensureInflated();
    inflated = true;
  }

  protected final void readFieldsCompressed(DataInput in) throws IOException {
    version = in.readByte();
    metadata = new Metadata();
    switch (version) {
    case 0:
    case 1:
      url = UTF8.readString(in); // read url 
      base = UTF8.readString(in); // read base

      content = new byte[in.readInt()]; // read content
      in.readFully(content);

      contentType = UTF8.readString(in); // read contentType
      // reconstruct metadata
      int keySize = in.readInt();
      String key;
      for (int i = 0; i < keySize; i++) {
        key = UTF8.readString(in);
        int valueSize = in.readInt();
        for (int j = 0; j < valueSize; j++) {
          metadata.add(key, UTF8.readString(in));
        }
      }
      break;
    case VERSION:
      url = Text.readString(in); // read url
      base = Text.readString(in); // read base

      content = new byte[in.readInt()]; // read content
      in.readFully(content);

      contentType = Text.readString(in); // read contentType
      metadata.readFields(in); // read meta data
      break;
    default:
      throw new VersionMismatchException(VERSION, version);
    }

  }

  protected final void writeCompressed(DataOutput out) throws IOException {
    out.writeByte(VERSION);

    Text.writeString(out, url); // write url
    Text.writeString(out, base); // write base

    out.writeInt(content.length); // write content
    out.write(content);

    Text.writeString(out, contentType); // write contentType

    metadata.write(out); // write metadata
  }

  public static Content read(DataInput in) throws IOException {
    Content content = new Content();
    content.readFields(in);
    return content;
  }

  //
  // Accessor methods
  //

  /** The url fetched. */
  public String getUrl() {
    ensureInflated();
    return url;
  }

  /** The base url for relative links contained in the content.
   * Maybe be different from url if the request redirected.
   */
  public String getBaseUrl() {
    ensureInflated();
    return base;
  }

  /** The binary content retrieved. */
  public byte[] getContent() {
    ensureInflated();
    return content;
  }

  public void setContent(byte[] content) {
    ensureInflated();
    this.content = content;
  }

  /** The media type of the retrieved content.
   * @see <a href="http://www.iana.org/assignments/media-types/">
   *      http://www.iana.org/assignments/media-types/</a>
   */
  public String getContentType() {
    ensureInflated();
    return contentType;
  }

  public void setContentType(String contentType) {
    ensureInflated();
    this.contentType = contentType;
  }

  /** Other protocol-specific data. */
  public Metadata getMetadata() {
    ensureInflated();
    return metadata;
  }

  /** Other protocol-specific data. */
  public void setMetadata(Metadata metadata) {
    ensureInflated();
    this.metadata = metadata;
  }

  public boolean equals(Object o) {
    ensureInflated();
    if (!(o instanceof Content)) {
      return false;
    }
    Content that = (Content) o;
    that.ensureInflated();
    return this.url.equals(that.url) && this.base.equals(that.base)
        && Arrays.equals(this.getContent(), that.getContent())
        && this.contentType.equals(that.contentType)
        && this.metadata.equals(that.metadata);
  }

  public String toString() {
    ensureInflated();
    StringBuffer buffer = new StringBuffer();

    buffer.append("Version: " + version + "\n");
    buffer.append("url: " + url + "\n");
    buffer.append("base: " + base + "\n");
    buffer.append("contentType: " + contentType + "\n");
    buffer.append("metadata: " + metadata + "\n");
    buffer.append("Content:\n");
    buffer.append(new String(content)); // try default encoding

    return buffer.toString();

  }

  public static void main(String argv[]) throws Exception {

    String usage = "Content (-local | -dfs <namenode:port>) recno segment";

    if (argv.length < 3) {
      System.out.println("usage:" + usage);
      return;
    }
    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.parseArgs(argv, 0, conf);
    try {
      int recno = Integer.parseInt(argv[0]);
      String segment = argv[1];

      Path file = new Path(segment, DIR_NAME);
      System.out.println("Reading from file: " + file);

      ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
          conf);

      Content content = new Content();
      contents.get(recno, content);
      System.out.println("Retrieved " + recno + " from file " + file);

      System.out.println(content);

      contents.close();
    } finally {
      fs.close();
    }
  }

  private String getContentType(String typeName, String url, byte[] data) {
    MimeType type = null;
    try {
      typeName = MimeType.clean(typeName);
      type = typeName == null ? null : this.mimeTypes.forName(typeName);
    } catch (MimeTypeException mte) {
      // Seems to be a malformed mime type name...
    }

    if (typeName == null || type == null || !type.matches(url)) {
      // If no mime-type header, or cannot find a corresponding registered
      // mime-type, or the one found doesn't match the url pattern
      // it shouldbe, then guess a mime-type from the url pattern
      type = this.mimeTypes.getMimeType(url);
      typeName = type == null ? typeName : type.getName();
    }
    if (typeName == null || type == null
        || (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
      // If no mime-type already found, or the one found doesn't match
      // the magic bytes it should be, then, guess a mime-type from the
      // document content (magic bytes)
      type = this.mimeTypes.getMimeType(data);
      typeName = type == null ? typeName : type.getName();
    }
    return typeName;
  }

  /**
   * By calling this method caller forces the next access to any property (via
   * getters and setters) to check if decompressing of data is really required.
   */
  public void forceInflate() {
    inflated = false;
  }

}
TOP

Related Classes of org.apache.nutch.protocol.Content

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.