// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2007 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: LinkResolver.java,v 1.13 2007/11/02 12:32:20 spyromus Exp $
//
package com.salas.bb.whatshot;
import EDU.oswego.cs.dl.util.concurrent.Executor;
import com.salas.bb.utils.StringUtils;
import com.salas.bb.utils.concurrency.ExecutorFactory;
import com.salas.bb.utils.concurrency.NamingThreadFactory;
import org.apache.commons.collections.ReferenceMap;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Link resolver contacts servers to learn titles of the links it's given.
 *
 * <p>Resolution is asynchronous: {@link #resolve(HotResultGroup)} answers from the shared
 * cache when it can, otherwise it schedules a background task and returns the group's
 * current name. When the task finishes, the listener passed to the constructor is notified.</p>
 *
 * <p>The title cache, the post-processing instructions, the custom scan limits and the custom
 * link resolvers are static and therefore shared by all resolver instances.</p>
 */
public class LinkResolver
{
    /** Default scan limit -- number of bytes to read from the stream looking for the title tag before giving up. */
    private static final int DEFAULT_SCAN_LIMIT = 2000;

    /** The map of patterns to replacement strings for the post-processing of titles. */
    private static final Map<Pattern, String> postProcessingInstructions = new LinkedHashMap<Pattern, String>();

    /**
     * The map of patterns to the scan limits. The default limit is 2K which means that 2Kb of a resource
     * will be fetched (max) to find the TITLE tag. You can adjust it with this map.
     */
    private static final Map<Pattern, Integer> customScanLimits = new LinkedHashMap<Pattern, Integer>();

    /** The list of custom special link resolvers. */
    private static final List<ICustomLinkResolver> customLinkResolvers = new LinkedList<ICustomLinkResolver>();

    /**
     * Cache of resolutions (link text to title). It's memory-sensitive.
     * The map itself is not synchronized, so every access must hold the monitor of this object.
     */
    private static final ReferenceMap CACHE = new ReferenceMap();

    /**
     * Resolution executor. Set to <code>NULL</code> by {@link #stop()}; volatile because
     * the worker threads poll it through {@link #isTerminated()}.
     */
    private volatile Executor executor;

    /**
     * Listener for the resolution events.
     */
    private final ILinkResolverListener listener;

    static
    {
        addPostProcessingInstruction(
            Pattern.compile("^([^:]+):\\s+(.+)\\s+on\\s+technorati", Pattern.CASE_INSENSITIVE),
            "Technorati tag: $1 ($2)");

        addCustomScanLimits(Pattern.compile("^http://(www\\.)?amazon\\.com", Pattern.CASE_INSENSITIVE), 20000);
    }

    /**
     * Creates a link resolver for a given listener.
     *
     * @param listener listener to notify when a group gets resolved.
     *
     * @throws IllegalArgumentException if listener is <code>NULL</code>.
     */
    public LinkResolver(ILinkResolverListener listener)
    {
        if (listener == null) throw new IllegalArgumentException("Listener can't be NULL");

        this.listener = listener;

        // NOTE(review): the meaning of the numeric arguments (5, 1000) is defined by ExecutorFactory -- confirm there.
        executor = ExecutorFactory.createPooledExecutor(new NamingThreadFactory("Link Resolver", Thread.MIN_PRIORITY),
            5, 1000);
    }

    /**
     * Stops link resolution immediately. After this call no new tasks are scheduled and
     * the tasks that are still running give up as soon as they notice the termination.
     */
    public void stop()
    {
        // Shutdown immediately and don't care about the unprocessed results.
        // Dropping the reference is the termination signal checked by isTerminated().
        executor = null;
    }

    /**
     * Returns the title of the link in the group or, schedules the
     * resolution and returns the group name.
     *
     * @param group group to resolve the link for.
     *
     * @return resolved title from the cache, or the group name when the title isn't known yet.
     */
    public synchronized String resolve(HotResultGroup group)
    {
        // Check local cache
        String title = getFromCache(group);
        if (title != null) return title;

        title = group.getName();

        // Take a snapshot: stop() may null the field at any moment, and calling
        // execute() on the field directly would throw a NullPointerException.
        Executor exec = executor;
        if (exec != null)
        {
            try
            {
                exec.execute(new ResolutionTask(group));
            } catch (InterruptedException e)
            {
                // Failed to schedule. Preserve the interruption for the caller.
                Thread.currentThread().interrupt();
            }
        }

        return title;
    }

    /**
     * Checks if the title for this group is in the cache.
     *
     * @param group group.
     *
     * @return cached title or <code>NULL</code> if the group has no link or the title isn't cached.
     */
    private String getFromCache(HotResultGroup group)
    {
        URL link = group.getLink();
        if (link == null) return null;

        synchronized (CACHE)
        {
            return (String)CACHE.get(link.toString());
        }
    }

    /**
     * Performs the post-processing of the title resolved. All registered instructions
     * are applied one after another in the order of their registration.
     *
     * @param title title.
     *
     * @return processed title (<code>NULL</code> and empty titles are returned untouched).
     */
    static String postprocessTitle(String title)
    {
        if (title == null || StringUtils.isEmpty(title)) return title;

        for (Map.Entry<Pattern, String> entry : postProcessingInstructions.entrySet())
        {
            title = entry.getKey().matcher(title).replaceAll(entry.getValue());
        }

        return title;
    }

    /**
     * Removes all instructions.
     */
    static void clearPostProcessingInstructions()
    {
        postProcessingInstructions.clear();
    }

    /**
     * Adds a post-processing instruction to the tail of the instructions list.
     *
     * @param matchPattern  pattern to match in the title.
     * @param replacement   replacement to make (may reference groups, like <code>$1</code>).
     */
    public static void addPostProcessingInstruction(Pattern matchPattern, String replacement)
    {
        postProcessingInstructions.put(matchPattern, replacement);
    }

    /**
     * Checks if a given URL requires some special treatment. The custom resolvers are
     * asked in the order of registration and the first non-<code>NULL</code> answer wins.
     *
     * @param url link to check.
     *
     * @return title or <code>NULL</code> if to follow usual procedures.
     */
    static String customLinkResolution(URL url)
    {
        if (url == null) return null;

        for (ICustomLinkResolver resolver : customLinkResolvers)
        {
            String title = resolver.resolve(url);
            if (title != null) return title;
        }

        return null;
    }

    /**
     * Clears the list of custom link resolvers.
     */
    static void clearCustomLinkResolvers()
    {
        customLinkResolvers.clear();
    }

    /**
     * Adds a custom link resolver to the end of the list.
     *
     * @param resolver resolver.
     */
    public static void addCustomLinkResolver(ICustomLinkResolver resolver)
    {
        customLinkResolvers.add(resolver);
    }

    /**
     * Adds new pattern for the URL recognition and the limit for the TITLE tag scanning procedure.
     *
     * @param pattern   pattern to look for in the link.
     * @param limit     scan limit in bytes.
     */
    public static void addCustomScanLimits(Pattern pattern, int limit)
    {
        customScanLimits.put(pattern, limit);
    }

    /**
     * Returns a scan limit for a link. The limit of the first registered pattern found
     * in the link is used; when nothing matches, the default limit is returned.
     *
     * @param link link (can be <code>NULL</code>).
     *
     * @return limit in bytes.
     */
    private static int getScanLimit(URL link)
    {
        if (link != null)
        {
            String ls = link.toString();
            for (Map.Entry<Pattern, Integer> entry : customScanLimits.entrySet())
            {
                if (entry.getKey().matcher(ls).find()) return entry.getValue();
            }
        }

        return DEFAULT_SCAN_LIMIT;
    }

    /**
     * Fetches the title from the stream until finds the '&lt;' or
     * the end. The stream is expected to be positioned right after the opening title tag.
     *
     * @param is input stream.
     *
     * @return trimmed title, or <code>NULL</code> if the resolver was stopped while reading.
     *
     * @throws IOException in case of I/O error.
     */
    String fetchTitle(InputStream is) throws IOException
    {
        int ch;
        ByteArrayOutputStream buf = new ByteArrayOutputStream();

        while ((ch = is.read()) != -1 && ch != '<')
        {
            buf.write(ch);
            if (isTerminated()) return null;
        }

        // NOTE: the bytes are decoded with the platform default charset.
        return buf.toString().trim();
    }

    /**
     * Returns <code>TRUE</code> if executor is no longer working.
     *
     * @return <code>TRUE</code> if terminated.
     */
    private boolean isTerminated()
    {
        return executor == null;
    }

    /**
     * Resolves a single link into the title.
     */
    private class ResolutionTask implements Runnable
    {
        /** Group to resolve. */
        private final HotResultGroup group;

        /** Lower-case tag to look for in the stream. */
        private final String tag = "<title>";

        /** Number of leading tag characters matched so far. */
        private int pos = 0;

        /**
         * Creates a resolver task.
         *
         * @param group group to resolve the title for.
         */
        public ResolutionTask(HotResultGroup group)
        {
            this.group = group;
        }

        /**
         * Main task method. Resolves the group and notifies the listener on success.
         */
        public void run()
        {
            if (isTerminated()) return;

            try
            {
                if (resolve(group)) listener.onGroupResolved(group);
            } catch (IOException e)
            {
                // Best effort: the group keeps its current name and nothing is cached,
                // so the resolution will be attempted again on the next request.
            }
        }

        /**
         * Invoked to resolve the hotlink into the title for this group item.
         *
         * @param group group to resolve.
         *
         * @return <code>TRUE</code> if the title was resolved and changed.
         *
         * @throws IOException in case of any I/O errors.
         */
        private boolean resolve(HotResultGroup group)
            throws IOException
        {
            // Don't resolve invisible groups
            // When they become visible, they will be resolved
            if (!group.isVisible()) return false;

            // Nothing to contact without a link
            URL url = group.getLink();
            if (url == null) return false;

            // Check if the link needs some special treatment.
            String title = customLinkResolution(url);
            if (title == null)
            {
                URLConnection con = url.openConnection();
                InputStream is = null;

                try
                {
                    // Asking for the content type implicitly connects to the server
                    String contentType = con.getContentType();
                    if (contentType != null && contentType.startsWith("text/html"))
                    {
                        is = new BufferedInputStream(con.getInputStream());
                        title = resolveFromStream(is, getScanLimit(url));
                    }

                    if (title != null) title = StringUtils.unescape(title);
                } finally
                {
                    try
                    {
                        if (is != null) is.close();
                    } finally
                    {
                        // The connection is open even when no stream was requested
                        // (non-HTML content), so release it explicitly.
                        if (con instanceof java.net.HttpURLConnection)
                        {
                            ((java.net.HttpURLConnection)con).disconnect();
                        }
                    }
                }
            }

            // If we were stopped meanwhile, the scan might have been aborted half-way.
            // Don't put a bogus "[Unresolved]" record in the shared cache and don't report.
            if (isTerminated()) return false;

            // Process the title to replace some parts or do any other post-processing
            if (title != null) title = postprocessTitle(title);
            if (StringUtils.isEmpty(title)) title = "[Unresolved] " + url.toString();

            // Remember the resolution in the cache
            synchronized (CACHE)
            {
                CACHE.put(url.toString(), title);
            }
            group.setResolvedTitle(title);

            return true;
        }

        /**
         * Resolves a title from stream.
         *
         * @param is    stream.
         * @param max   maximum number of bytes to scan looking for the title tag.
         *
         * @return title or <code>NULL</code> if not found.
         *
         * @throws IOException if I/O error happens.
         */
        String resolveFromStream(InputStream is, int max) throws IOException
        {
            String title = null;
            int i = 0;
            int b;

            while (title == null && !isTerminated() && i++ < max && (b = is.read()) != -1)
            {
                title = resolveChar(b, is);
            }

            return title;
        }

        /**
         * Resolves a character and moves on. Returns a title if recognized.
         * Whitespace is ignored and the letters are compared case-insensitively.
         *
         * @param b     byte from the stream.
         * @param is    input stream.
         *
         * @return title or <code>NULL</code> if the title tag isn't completely matched yet.
         *
         * @throws IOException if I/O exception happens.
         */
        String resolveChar(int b, InputStream is)
            throws IOException
        {
            if (pos < tag.length())
            {
                // Skip whitespace
                if (b == ' ' || b == '\n' || b == '\r' || b == '\t') return null;

                // Lowercase ASCII capital letters only; shifting any other byte could
                // turn it into a character of the tag (28 -> '<', 30 -> '>').
                if (b >= 'A' && b <= 'Z') b += 'a' - 'A';

                // Check against the pattern
                if (tag.charAt(pos) == b)
                {
                    pos++;
                } else
                {
                    // The mismatched byte may itself start the tag (as in "<<title>")
                    pos = (tag.charAt(0) == b) ? 1 : 0;
                }
            }

            return (pos == tag.length()) ? fetchTitle(is) : null;
        }
    }
}