Package org.apache.nutch.collection

Source Code of org.apache.nutch.collection.CollectionManager

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.collection;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.util.DomUtil;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.ObjectCache;
import org.apache.xerces.dom.DocumentImpl;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

public class CollectionManager extends Configured {

  public static final String DEFAULT_FILE_NAME = "subcollections.xml";

  static final Log LOG = LogFactory.getLog(CollectionManager.class);

  transient Map collectionMap = new HashMap();

  transient URL configfile;
 
  public CollectionManager(Configuration conf) {
    super(conf);
    init();
  }
 
  /**
   * Used for testing
   */
  protected CollectionManager(){
    super(NutchConfiguration.create());
  }

  protected void init(){
    try {
      if (LOG.isInfoEnabled()) { LOG.info("initializing CollectionManager"); }
      // initialize known subcollections
      configfile = getConf().getResource(
          getConf().get("subcollections.config", DEFAULT_FILE_NAME));

      InputStream input = getConf().getConfResourceAsInputStream(
          getConf().get("subcollections.config", DEFAULT_FILE_NAME));
      parse(input);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Error occured:" + e);
        e.printStackTrace(LogUtil.getWarnStream(LOG));
      }
    }
  }

  protected void parse(InputStream input) {
    Element collections = DomUtil.getDom(input);

    if (collections != null) {
      NodeList nodeList = collections
          .getElementsByTagName(Subcollection.TAG_COLLECTION);

      if (LOG.isInfoEnabled()) {
        LOG.info("file has" + nodeList.getLength() + " elements");
      }
     
      for (int i = 0; i < nodeList.getLength(); i++) {
        Element scElem = (Element) nodeList.item(i);
        Subcollection subCol = new Subcollection(getConf());
        subCol.initialize(scElem);
        collectionMap.put(subCol.name, subCol);
      }
    } else if (LOG.isInfoEnabled()) {
      LOG.info("Cannot find collections");
    }
  }
 
  public static CollectionManager getCollectionManager(Configuration conf) {
    String key = "collectionmanager";
    ObjectCache objectCache = ObjectCache.get(conf);
    CollectionManager impl = (CollectionManager)objectCache.getObject(key);
    if (impl == null) {
      try {
        if (LOG.isInfoEnabled()) {
          LOG.info("Instantiating CollectionManager");
        }
        impl=new CollectionManager(conf);
        objectCache.setObject(key,impl);
      } catch (Exception e) {
        throw new RuntimeException("Couldn't create CollectionManager",e);
      }
    }
    return impl;
  }

  /**
   * Returns named subcollection
   *
   * @param id
   * @return Named SubCollection (or null if not existing)
   */
  public Subcollection getSubColection(final String id) {
    return (Subcollection) collectionMap.get(id);
  }

  /**
   * Delete named subcollection
   *
   * @param id
   *          Id of SubCollection to delete
   */
  public void deleteSubCollection(final String id) throws IOException {
    final Subcollection subCol = getSubColection(id);
    if (subCol != null) {
      collectionMap.remove(id);
    }
  }

  /**
   * Create a new subcollection.
   *
   * @param name
   *          Name of SubCollection to create
   * @return Created SubCollection or null if allready existed
   */
  public Subcollection createSubCollection(final String id, final String name) {
    Subcollection subCol = null;

    if (!collectionMap.containsKey(id)) {
      subCol = new Subcollection(id, name, getConf());
      collectionMap.put(id, subCol);
    }

    return subCol;
  }

  /**
   * Return names of collections url is part of
   *
   * @param url
   *          The url to test against Collections
   * @return Space delimited string of collection names url is part of
   */
  public String getSubCollections(final String url) {
    StringBuilder collections = new StringBuilder();
    final Iterator iterator = collectionMap.values().iterator();

    while (iterator.hasNext()) {
      final Subcollection subCol = (Subcollection) iterator.next();
      if (subCol.filter(url) != null) {
        if (collections.length() > 0) {
          collections.append(' ');
        }
        collections.append(subCol.name);
      }
    }
    if (LOG.isTraceEnabled()) { LOG.trace("subcollections:" + collections); }
   
    return collections.toString();
  }

  /**
   * Returns all collections
   *
   * @return All collections CollectionManager knows about
   */
  public Collection getAll() {
    return collectionMap.values();
  }

  /**
   * Save collections into file
   *
   * @throws Exception
   */
  public void save() throws IOException {
    try {
      final FileOutputStream fos = new FileOutputStream(new File(configfile
          .getFile()));
      final Document doc = new DocumentImpl();
      final Element collections = doc
          .createElement(Subcollection.TAG_COLLECTIONS);
      final Iterator iterator = collectionMap.values().iterator();

      while (iterator.hasNext()) {
        final Subcollection subCol = (Subcollection) iterator.next();
        final Element collection = doc
            .createElement(Subcollection.TAG_COLLECTION);
        collections.appendChild(collection);
        final Element name = doc.createElement(Subcollection.TAG_NAME);
        name.setNodeValue(subCol.getName());
        collection.appendChild(name);
        final Element whiteList = doc
            .createElement(Subcollection.TAG_WHITELIST);
        whiteList.setNodeValue(subCol.getWhiteListString());
        collection.appendChild(whiteList);
        final Element blackList = doc
            .createElement(Subcollection.TAG_BLACKLIST);
        blackList.setNodeValue(subCol.getBlackListString());
        collection.appendChild(blackList);
      }

      DomUtil.saveDom(fos, collections);
      fos.flush();
      fos.close();
    } catch (FileNotFoundException e) {
      throw new IOException(e.toString());
    }
  }
}
TOP

Related Classes of org.apache.nutch.collection.CollectionManager

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.