Package org.carrot2.source.opensearch

Source Code of org.carrot2.source.opensearch.RomeFetcherUtils

/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.source.opensearch;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.StringEscapeUtils;
import org.carrot2.core.Document;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.util.StringUtils;

import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.fetcher.FeedFetcher;
import com.sun.syndication.fetcher.FetcherException;
import com.sun.syndication.io.FeedException;

/**
* Utility methods for working with Rome fetcher.
*/
public class RomeFetcherUtils
{
    /**
     * Fetches an OpenSearch feed from the provided URL and returns the entries as Carrot2
     * {@link SearchEngineResponse}.
     *
     * @param url the OpenSearch feed to fetch
     * @param feedFetcher Rome fetcher to use
     * @return {@link SearchEngineResponse} containing entries from the feed
     */
    @SuppressWarnings("rawtypes")
    public static SearchEngineResponse fetchUrl(final String url, FeedFetcher feedFetcher)
        throws IOException, FeedException, FetcherException, MalformedURLException
    {
        /*
         * TODO: Rome fetcher uses SUN's HttpClient and opens a persistent HTTP connection
         * (background thread that keeps reference to the class loader). This causes minor
         * memory leaks when reloading Web applications. Consider: 1) patching rome
         * fetcher sources and adding Connection: close to request headers, 2) using
         * Apache HttpClient, 3) using manual fetch of the syndication feed.
         */
        final SyndFeed feed = feedFetcher.retrieveFeed(new URL(url));
        final SearchEngineResponse response = new SearchEngineResponse();

        // The documentation does not mention that null value can be returned
        // but we've seen a NPE here:
        // http://builds.carrot2.org/browse/C2HEAD-SOURCES-4.
        if (feed != null)
        {
            final List entries = feed.getEntries();
            for (Iterator it = entries.iterator(); it.hasNext();)
            {
                final SyndEntry entry = (SyndEntry) it.next();
                final Document document = new Document();

                document.setField(Document.TITLE, clean(entry.getTitle()));
                document.setField(Document.SUMMARY, clean(entry.getDescription()
                    .getValue()));
                document.setField(Document.CONTENT_URL, entry.getLink());

                response.results.add(document);
            }
        }

        return response;
    }

    private static String clean(String string)
    {
        return StringUtils.removeHtmlTags(StringEscapeUtils.unescapeHtml(string));
    }
}
TOP

Related Classes of org.carrot2.source.opensearch.RomeFetcherUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.