/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.collection;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.util.DomUtil;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.ObjectCache;
import org.apache.xerces.dom.DocumentImpl;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
 * Holds the subcollections defined for a configuration and answers which of
 * them a given url belongs to.
 *
 * <p>Definitions are read from the configuration resource named by the
 * <code>subcollections.config</code> property (default
 * {@link #DEFAULT_FILE_NAME}) and can be written back with {@link #save()}.
 * Instances are normally obtained through
 * {@link #getCollectionManager(Configuration)}, which caches one manager per
 * configuration in the {@link ObjectCache}.
 */
public class CollectionManager extends Configured {

  /** Resource name used when <code>subcollections.config</code> is not set. */
  public static final String DEFAULT_FILE_NAME = "subcollections.xml";

  static final Log LOG = LogFactory.getLog(CollectionManager.class);

  /** Known subcollections; keyed by name when parsed, by id when created. */
  transient Map collectionMap = new HashMap();

  /** Location of the parsed configuration resource; target of {@link #save()}. */
  transient URL configfile;

  /**
   * Creates a manager and loads the subcollection definitions.
   *
   * @param conf
   *          configuration used to locate the definitions resource
   */
  public CollectionManager(Configuration conf) {
    super(conf);
    init();
  }

  /**
   * Used for testing. Does not load any definitions.
   */
  protected CollectionManager() {
    super(NutchConfiguration.create());
  }

  /**
   * Locates the definitions resource and parses it. Any failure is logged at
   * warn level and otherwise ignored, leaving the manager with whatever was
   * parsed so far.
   */
  protected void init() {
    InputStream input = null;
    try {
      if (LOG.isInfoEnabled()) { LOG.info("initializing CollectionManager"); }
      // initialize known subcollections
      final String resourceName =
          getConf().get("subcollections.config", DEFAULT_FILE_NAME);
      configfile = getConf().getResource(resourceName);
      input = getConf().getConfResourceAsInputStream(resourceName);
      parse(input);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Error occured:" + e);
        e.printStackTrace(LogUtil.getWarnStream(LOG));
      }
    } finally {
      // The stream was opened here, so it is released here.
      if (input != null) {
        try {
          input.close();
        } catch (IOException e) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Could not close subcollections configuration: " + e);
          }
        }
      }
    }
  }

  /**
   * Reads subcollection definitions from the given stream and registers each
   * one under its name.
   *
   * @param input
   *          stream containing the XML definitions
   */
  protected void parse(InputStream input) {
    Element collections = DomUtil.getDom(input);

    if (collections != null) {
      NodeList nodeList = collections
          .getElementsByTagName(Subcollection.TAG_COLLECTION);

      if (LOG.isInfoEnabled()) {
        LOG.info("file has " + nodeList.getLength() + " elements");
      }

      for (int i = 0; i < nodeList.getLength(); i++) {
        Element scElem = (Element) nodeList.item(i);
        Subcollection subCol = new Subcollection(getConf());
        subCol.initialize(scElem);
        collectionMap.put(subCol.name, subCol);
      }
    } else if (LOG.isInfoEnabled()) {
      LOG.info("Cannot find collections");
    }
  }

  /**
   * Returns the manager cached for the given configuration, creating and
   * caching it on first use.
   *
   * @param conf
   *          configuration whose manager is wanted
   * @return the manager associated with <code>conf</code>
   * @throws RuntimeException
   *           if the manager cannot be created
   */
  public static CollectionManager getCollectionManager(Configuration conf) {
    String key = "collectionmanager";
    ObjectCache objectCache = ObjectCache.get(conf);
    CollectionManager impl = (CollectionManager) objectCache.getObject(key);
    if (impl == null) {
      try {
        if (LOG.isInfoEnabled()) {
          LOG.info("Instantiating CollectionManager");
        }
        impl = new CollectionManager(conf);
        objectCache.setObject(key, impl);
      } catch (Exception e) {
        throw new RuntimeException("Couldn't create CollectionManager", e);
      }
    }
    return impl;
  }

  /**
   * Returns named subcollection
   *
   * @param id
   *          key the subcollection is registered under
   * @return Named SubCollection (or null if not existing)
   */
  public Subcollection getSubColection(final String id) {
    return (Subcollection) collectionMap.get(id);
  }

  /**
   * Delete named subcollection. Does nothing when no subcollection is
   * registered under the given id.
   *
   * @param id
   *          Id of SubCollection to delete
   * @throws IOException
   *           declared for interface compatibility; not thrown by this
   *           implementation
   */
  public void deleteSubCollection(final String id) throws IOException {
    collectionMap.remove(id);
  }

  /**
   * Create a new subcollection.
   *
   * @param id
   *          Id to register the SubCollection under
   * @param name
   *          Name of SubCollection to create
   * @return Created SubCollection or null if already existed
   */
  public Subcollection createSubCollection(final String id, final String name) {
    Subcollection subCol = null;

    if (!collectionMap.containsKey(id)) {
      subCol = new Subcollection(id, name, getConf());
      collectionMap.put(id, subCol);
    }

    return subCol;
  }

  /**
   * Return names of collections url is part of
   *
   * @param url
   *          The url to test against Collections
   * @return Space delimited string of collection names url is part of
   */
  public String getSubCollections(final String url) {
    StringBuilder collections = new StringBuilder();
    final Iterator iterator = collectionMap.values().iterator();

    while (iterator.hasNext()) {
      final Subcollection subCol = (Subcollection) iterator.next();
      if (subCol.filter(url) != null) {
        if (collections.length() > 0) {
          collections.append(' ');
        }
        collections.append(subCol.name);
      }
    }
    if (LOG.isTraceEnabled()) { LOG.trace("subcollections:" + collections); }

    return collections.toString();
  }

  /**
   * Returns all collections
   *
   * @return All collections CollectionManager knows about; this is a live
   *         view of the manager's internal map
   */
  public Collection getAll() {
    return collectionMap.values();
  }

  /**
   * Save collections into the file the configuration was loaded from.
   *
   * @throws IOException
   *           if no configuration file location is known, or the file cannot
   *           be opened or written
   */
  public void save() throws IOException {
    if (configfile == null) {
      throw new IOException(
          "Cannot save subcollections: configuration file location is unknown");
    }

    // Build the whole document first so that a failure here does not leave
    // the configuration file truncated.
    final Document doc = new DocumentImpl();
    final Element collections = doc
        .createElement(Subcollection.TAG_COLLECTIONS);
    final Iterator iterator = collectionMap.values().iterator();

    while (iterator.hasNext()) {
      final Subcollection subCol = (Subcollection) iterator.next();
      final Element collection = doc
          .createElement(Subcollection.TAG_COLLECTION);
      collections.appendChild(collection);
      appendTextElement(doc, collection, Subcollection.TAG_NAME,
          subCol.getName());
      appendTextElement(doc, collection, Subcollection.TAG_WHITELIST,
          subCol.getWhiteListString());
      appendTextElement(doc, collection, Subcollection.TAG_BLACKLIST,
          subCol.getBlackListString());
    }

    FileOutputStream fos = null;
    try {
      fos = new FileOutputStream(new File(configfile.getFile()));
      DomUtil.saveDom(fos, collections);
      fos.flush();
    } catch (FileNotFoundException e) {
      // Same message as before, but keep the original exception as the cause.
      final IOException wrapped = new IOException(e.toString());
      wrapped.initCause(e);
      throw wrapped;
    } finally {
      if (fos != null) {
        fos.close();
      }
    }
  }

  /**
   * Appends a child element holding the given text to <code>parent</code>.
   *
   * <p>The text is stored in a text node because setting the node value of an
   * element has no effect in the DOM. A null text yields an empty element.
   */
  private static void appendTextElement(final Document doc,
      final Element parent, final String tag, final String text) {
    final Element child = doc.createElement(tag);
    if (text != null) {
      child.appendChild(doc.createTextNode(text));
    }
    parent.appendChild(child);
  }
}