Package org.archive.crawler.spring

Source Code of org.archive.crawler.spring.SheetOverlaysManager

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.crawler.spring;

import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.spring.OverlayMapsSource;
import org.archive.spring.Sheet;
import org.archive.util.PrefixFinder;
import org.archive.util.SurtPrefixSet;
import org.springframework.beans.BeansException;
import org.springframework.beans.TypeMismatchException;
import org.springframework.beans.factory.BeanFactory;
import org.springframework.beans.factory.BeanFactoryAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.event.ContextRefreshedEvent;

/**
* Manager which marks-up CrawlURIs with the names of all applicable
* Sheets, and returns overlay maps by name.
*
* @contributor gojomo
*/
public class SheetOverlaysManager implements
BeanFactoryAware, OverlayMapsSource, ApplicationListener<ApplicationEvent> {
    private static final Logger logger = Logger.getLogger(SheetOverlaysManager.class.getName());
   

    protected BeanFactory beanFactory;
    /** all SheetAssociations by DecideRule evaluation */
    protected SortedSet<DecideRuledSheetAssociation> ruleAssociations =
        new ConcurrentSkipListSet<DecideRuledSheetAssociation>();
    protected NavigableMap<String,List<String>> sheetNamesBySurt = new ConcurrentSkipListMap<String,List<String>>();
   
    /** all sheets by (bean)name*/
    protected Map<String,Sheet> sheetsByName = new ConcurrentHashMap<String, Sheet>();
   
    public void setBeanFactory(BeanFactory beanFactory) throws BeansException {
        this.beanFactory = beanFactory;
    }
   
    /**
     * Collect all Sheets, by beanName.
     * @param map
     */
    @Autowired(required=false)
    public void setSheetsByName(Map<String,Sheet> map) {
        this.sheetsByName = map;
    }
    /**
     * Sheets, by name; starts with all autowired Sheets but others
     * may be added by other means (mid-crawl reconfiguration).
     * @return map of Sheets by their String name
     */
    public Map<String,Sheet> getSheetsByName() {
        return this.sheetsByName;
    }

    /**
     * All DecideRuledSheetAssociations, in Ordered order
     * 
     * @return set of DecideRuledSheetAssociation
     */
    public SortedSet<DecideRuledSheetAssociation> getRuleAssociations() {
        return this.ruleAssociations;
    }
   
    /**
     * Sheet names, by the SURT prefix to which they should be applied.
     *
     * @return map of Sheet names by their configured SURT
     */
    public NavigableMap<String,List<String>> getSheetsNamesBySurt() {
        return this.sheetNamesBySurt;
    }
    /**
     * Collect all rule-based SheetAssociations. Typically autowired
     * from the set of all DecideRuledSheetAssociation instances.
     * @param ruleSheets
     */
    @Autowired(required=false)
    public void addRuleAssociations(Set<DecideRuledSheetAssociation> associations) {
        // always keep sorted by order
        this.ruleAssociations.clear();
        this.ruleAssociations.addAll(associations);
    }
   
    public void addRuleAssociation(DecideRuledSheetAssociation assoc) {
        this.ruleAssociations.add(assoc);
    }

    /**
     * Collect all SURT-based SheetAssociations. Typically autowired
     * from the set of all SurtPrefixesSheetAssociation instances
     * declared in the initial configuration.
     * @param surtSheets
     */
    @Autowired(required=false)
    public void addSurtAssociations(List<SurtPrefixesSheetAssociation> associations) {
        for(SurtPrefixesSheetAssociation association : associations) {
            addSurtsAssociation(association);
        }
    }
   
    public void addSurtAssociation(String prefix, String sheetName) {
        List<String> sheetNames = sheetNamesBySurt.get(prefix);
        if(sheetNames == null) {
            sheetNames = new LinkedList<String>();
        }
        sheetNames.add(sheetName);
        sheetNamesBySurt.put(prefix, sheetNames);
    }
   
    public boolean removeSurtAssociation(String prefix, String sheetName) {
        List<String> sheetNames = sheetNamesBySurt.get(prefix);
        if(sheetNames == null) {
            // no such association
            return false;
        }
        return sheetNames.remove(sheetName);
    }

    /**
     * Add an individual surtsAssociation to the sheetNamesBySurt map.
     */
    public void addSurtsAssociation(SurtPrefixesSheetAssociation assoc) {
        for(String prefix : assoc.getSurtPrefixes()) {
            for(String s : assoc.getTargetSheetNames()) {
                addSurtAssociation(prefix, s);
            }
        }
    }
   

    /**
     * Retrieve the named overlay Map.
     *
     * @see org.archive.spring.OverlayMapsSource#getOverlayMap(java.lang.String)
     */
    public Map<String, Object> getOverlayMap(String name) {
        Sheet sheet = sheetsByName.get(name);
        if (sheet != null) {
            return sheet.getMap();
        } else {
            return null;
        }
    }

    /**
     * Ensure all sheets are 'primed' after the entire ApplicatiotnContext
     * is assembled. This ensures target HasKeyedProperties beans know
     * any long paths by which their properties are addressed, and
     * handles (by either PropertyEditor-conversion or a fast-failure)
     * any type-mismatches between overlay values and their target
     * properties.
     * @see org.springframework.context.ApplicationListener#onApplicationEvent(org.springframework.context.ApplicationEvent)
     */
    @Override
    public void onApplicationEvent(ApplicationEvent event) {
        if(event instanceof ContextRefreshedEvent) {
            for(Sheet s: sheetsByName.values()) {
                s.prime(); // exception if Sheet can't target overridable properties
            }
            // log warning for any sheets named but not present
            HashSet<String> allSheetNames = new HashSet<String>();
            for(DecideRuledSheetAssociation assoc : ruleAssociations) {
                allSheetNames.addAll(assoc.getTargetSheetNames());
            }
            for(List<String> names : sheetNamesBySurt.values()) {
                allSheetNames.addAll(names);
            }
            for(String name : allSheetNames) {
                if(!sheetsByName.containsKey(name)) {
                    logger.warning("sheet '"+name+"' referenced but absent");
                }
            }
        }
    }
   
    //
    // Convenience methods for during-crawl overlay updates
    //
   
    /**
     * Add to named sheet an overlay of the given bean-path and new value.
     * Creates the sheet if it does not already exist; re-primes the sheet
     * after the change to inform any targeted beans of new external paths.
     *
     * Only if/when the sheet is applied via associations will the overlay
     * have a noticeably effect. Inserting/mutating/priming sheets should
     * only be done in a paused crawl.
     *
     * @param sheetName sheet name to change (or create)
     * @param beanPath target bean-path of overlay
     * @param value new value
     * @return old value, if any
     */
    public Object putSheetOverlay(String sheetName, String beanPath, Object value) {
        Sheet sheet = getOrCreateSheet(sheetName);
        Object prevVal = sheet.getMap().put(beanPath, value);
        try {
            sheet.prime();
        } catch (TypeMismatchException tme) {
            // revert to presumably non-damaging value
            sheet.getMap().put(beanPath, prevVal);
            throw tme;
        }
        return prevVal;
    }
   
    /**
     * Remove the given bean-path overlay in the named sheet.
     *
     * @param sheetName sheet name from which to remove overlay
     * @param beanPath overlay to remove
     * @return previous overlay value, if any
     */
    public Object removeSheetOverlay(String sheetName, String beanPath) {
        Sheet sheet = sheetsByName.get(sheetName);
        if(sheet==null) {
            return null;
        }
        // TODO: do all the externalPaths created by priming need eventual cleanup?
        return sheet.getMap().remove(beanPath);
    }
   
    /**
     * Delete a named sheet from all associations and the master named
     * sheets map.
     * @param sheetName sheet name to delete
     * @return true if any associations/sheet actually deleted
     */
    public boolean deleteSheet(String sheetName) {
        boolean anyDeleted = false;
        // remove as target of any ruled-associations
        for(DecideRuledSheetAssociation assoc : ruleAssociations) {
            anyDeleted |= assoc.getTargetSheetNames().remove(sheetName);
        }
        // remove as target of any surt-associations
        for(List<String> sheetNames : sheetNamesBySurt.values()) {
            anyDeleted |= sheetNames.remove(sheetName);           
        }
        anyDeleted |= (null != sheetsByName.remove(sheetName));
        return anyDeleted;
    }
   
    /**
     * Get a Sheet of the given name, or create if it does not already
     * exist. Provided for convenience of creating Sheet instances after
     * the container has been built.
     *
     * To have effect as an overlay, the returned Sheet must be:
     *
     * (1) filled with overlay entries, where the key is a full bean-path
     * and the value the alternate overlay value;
     * (2) primed via the prime() method, which will throw an exception
     * if the target bean-path does not address a compatible overlayable
     * value;
     * (3) associated to some URIs, by the addSurtAssociation or
     * addRuledAssociation methods
     *
     * @param name Sheet name to create; must be unique
     * @return created Sheet
     */
    public Sheet getOrCreateSheet(String name) {
        Sheet sheet = sheetsByName.get(name);
        if(sheet==null) {
            sheet = new Sheet();
            sheet.setBeanFactory(beanFactory);
            sheet.setName(name);
            sheet.setMap(new HashMap<String, Object>());
            sheetsByName.put(name, sheet);
        }
        return sheet;
    }
   
    /**
     * Apply the proper overlays (by Sheet beanName) to the given CrawlURI,
     * according to configured associations. 
     *
     * TODO: add guard against redundant application more than once?
     * TODO: add mechanism for reapplying overlays after settings change?
     * @param curi
     */
    public void applyOverlaysTo(CrawlURI curi) {
        curi.setOverlayMapsSource(this);
        // apply SURT-based overlays
        curi.getOverlayNames().clear(); // clear previous info
        String effectiveSurt = SurtPrefixSet.getCandidateSurt(curi.getPolicyBasisUURI());
        List<String> foundPrefixes = PrefixFinder.findKeys(sheetNamesBySurt, effectiveSurt);      
        for(String prefix : foundPrefixes) {
            for(String name : sheetNamesBySurt.get(prefix)) {
                curi.getOverlayNames().add(name);
            }
        }
        // apply deciderule-based overlays
        for(DecideRuledSheetAssociation assoc : ruleAssociations) {
            try {
                if(assoc.getRules().accepts(curi)) {
                    curi.getOverlayNames().addAll(assoc.getTargetSheetNames());
                }
            } catch (Exception e) {
                logger.log(Level.SEVERE, "problem determining whether to apply overlays, so not applying " + assoc.getTargetSheetNames() + " to " + curi, e);
            }
        }
        // even if no overlays set, let creation of empty list signal
        // step has occurred -- helps ensure overlays added once-only
        curi.getOverlayNames();
    }
}
TOP

Related Classes of org.archive.crawler.spring.SheetOverlaysManager

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Corporation. Contact coftware#gmail.com.