/*
* Licensed to David Pilato (the "Author") under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Author licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.river.rss;
import com.rometools.rome.feed.rss.Channel;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
import com.rometools.rome.io.XmlReader;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MappingMetaData;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.joda.time.format.ISODateTimeFormat;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.indices.IndexAlreadyExistsException;
import org.elasticsearch.river.AbstractRiverComponent;
import org.elasticsearch.river.River;
import org.elasticsearch.river.RiverName;
import org.elasticsearch.river.RiverSettings;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.Map;
import java.util.UUID;
import static org.elasticsearch.client.Requests.indexRequest;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.river.rss.RssToJson.toJson;
/**
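* River that polls one or more RSS feeds and indexes new entries into Elasticsearch.
* <p>
* A minimal sketch of a river creation request (river and index names are illustrative;
* field names follow the settings parsed in the constructor below):
* <pre>
* PUT /_river/my_rss_river/_meta
* {
*   "type": "rss",
*   "rss": {
*     "feeds": [
*       { "name": "lemonde", "url": "http://www.lemonde.fr/rss/une.xml",
*         "update_rate": "15m", "ignore_ttl": false }
*     ],
*     "raw": true
*   },
*   "index": {
*     "index": "my_rss_river", "type": "page",
*     "bulk_size": 100, "flush_interval": "5s", "max_concurrent_bulk": 1
*   }
* }
* </pre>
*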
* @author dadoonet (David Pilato)
*/
public class RssRiver extends AbstractRiverComponent implements River {
private final Client client;
private final String indexName;
private final String typeName;
private final Boolean raw;
private final int bulkSize;
private final int maxConcurrentBulk;
private final TimeValue bulkFlushInterval;
private volatile BulkProcessor bulkProcessor;
private volatile ArrayList<Thread> threads;
private volatile boolean closed = false;
private final ArrayList<RssRiverFeedDefinition> feedsDefinition;
@SuppressWarnings({ "unchecked" })
@Inject
public RssRiver(RiverName riverName, RiverSettings settings, Client client)
throws MalformedURLException {
super(riverName, settings);
this.client = client;
if (settings.settings().containsKey("rss")) {
Map<String, Object> rssSettings = (Map<String, Object>) settings.settings().get("rss");
// Getting feeds array
boolean array = XContentMapValues.isArray(rssSettings.get("feeds"));
if (array) {
ArrayList<Map<String, Object>> feeds = (ArrayList<Map<String, Object>>) rssSettings.get("feeds");
feedsDefinition = new ArrayList<RssRiverFeedDefinition>(feeds.size());
for (Map<String, Object> feed : feeds) {
String feedname = XContentMapValues.nodeStringValue(feed.get("name"), null);
String url = XContentMapValues.nodeStringValue(feed.get("url"), null);
TimeValue updateRate = TimeValue.parseTimeValue(XContentMapValues.nodeStringValue(
feed.get("update_rate"), null), TimeValue.timeValueMinutes(15));
boolean ignoreTtl = XContentMapValues.nodeBooleanValue(feed.get("ignore_ttl"), false);
feedsDefinition.add(new RssRiverFeedDefinition(feedname, url, updateRate, ignoreTtl));
}
} else {
logger.warn("rss.url and rss.update_rate have been deprecated. Use rss.feeds[].url and rss.feeds[].update_rate instead.");
logger.warn("See https://github.com/dadoonet/rssriver/issues/6 for more details...");
String url = XContentMapValues.nodeStringValue(rssSettings.get("url"), null);
TimeValue updateRate = TimeValue.parseTimeValue(XContentMapValues.nodeStringValue(
rssSettings.get("update_rate"), null), TimeValue.timeValueMinutes(15));
boolean ignoreTtl = XContentMapValues.nodeBooleanValue("ignore_ttl", false);
feedsDefinition = new ArrayList<RssRiverFeedDefinition>(1);
feedsDefinition.add(new RssRiverFeedDefinition(null, url, updateRate, ignoreTtl));
}
raw = XContentMapValues.nodeBooleanValue(rssSettings.get("raw"), true);
} else {
String url = "http://www.lemonde.fr/rss/une.xml";
logger.warn("You didn't define the rss url. Switching to defaults : [{}]", url);
feedsDefinition = new ArrayList<RssRiverFeedDefinition>(1);
feedsDefinition.add(new RssRiverFeedDefinition("lemonde", url, TimeValue.timeValueMinutes(15), false));
raw = true;
}
if (settings.settings().containsKey("index")) {
Map<String, Object> indexSettings = (Map<String, Object>) settings
.settings().get("index");
indexName = XContentMapValues.nodeStringValue(
indexSettings.get("index"), riverName.name());
typeName = XContentMapValues.nodeStringValue(
indexSettings.get("type"), "page");
bulkSize = XContentMapValues.nodeIntegerValue(
indexSettings.get("bulk_size"), 100);
bulkFlushInterval = TimeValue.parseTimeValue(XContentMapValues.nodeStringValue(
indexSettings.get("flush_interval"), null), TimeValue.timeValueSeconds(5));
maxConcurrentBulk = XContentMapValues.nodeIntegerValue(indexSettings.get("max_concurrent_bulk"), 1);
} else {
indexName = riverName.name();
typeName = "page";
bulkSize = 100;
maxConcurrentBulk = 1;
bulkFlushInterval = TimeValue.timeValueSeconds(5);
}
}
@Override
public void start() {
if (logger.isInfoEnabled()) logger.info("Starting rss stream");
try {
client.admin().indices().prepareCreate(indexName).execute()
.actionGet();
} catch (Exception e) {
if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException) {
// that's fine
} else if (ExceptionsHelper.unwrapCause(e) instanceof ClusterBlockException) {
// ok, not recovered yet..., let's start indexing and hope we
// recover by the first bulk
// TODO: smarter logic could register a cluster state listener
// here and only start indexing once the block is removed...
} else {
logger.warn("failed to create index [{}], disabling river...",
e, indexName);
return;
}
}
try {
pushMapping(indexName, typeName, RssToJson.buildRssMapping(typeName, raw));
} catch (Exception e) {
logger.warn("failed to create mapping for [{}/{}], disabling river...",
e, indexName, typeName);
return;
}
// Creating bulk processor
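// It flushes when bulkSize actions are queued or when bulkFlushInterval elapses,
// and allows up to maxConcurrentBulk bulk requests in flight.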
this.bulkProcessor = BulkProcessor.builder(client, new BulkProcessor.Listener() {
@Override
public void beforeBulk(long executionId, BulkRequest request) {
logger.debug("Going to execute new bulk composed of {} actions", request.numberOfActions());
}
@Override
public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
logger.debug("Executed bulk composed of {} actions", request.numberOfActions());
if (response.hasFailures()) {
logger.warn("There was failures while executing bulk", response.buildFailureMessage());
if (logger.isDebugEnabled()) {
for (BulkItemResponse item : response.getItems()) {
if (item.isFailed()) {
logger.debug("Error for {}/{}/{} for {} operation: {}", item.getIndex(),
item.getType(), item.getId(), item.getOpType(), item.getFailureMessage());
}
}
}
}
}
@Override
public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
logger.warn("Error executing bulk", failure);
}
})
.setBulkActions(bulkSize)
.setConcurrentRequests(maxConcurrentBulk)
.setFlushInterval(bulkFlushInterval)
.build();
// We create as many Threads as there are feeds
threads = new ArrayList<Thread>(feedsDefinition.size());
int threadNumber = 0;
for (RssRiverFeedDefinition feedDefinition : feedsDefinition) {
Thread thread = EsExecutors.daemonThreadFactory(settings.globalSettings(), "rss_slurper_" + threadNumber)
.newThread(new RSSParser(feedDefinition));
thread.start();
threads.add(thread);
threadNumber++;
}
}
@Override
public void close() {
if (logger.isInfoEnabled()) logger.info("Closing rss river");
closed = true;
bulkProcessor.close();
// We have to close each Thread
if (threads != null) {
for (Thread thread : threads) {
if (thread != null) {
thread.interrupt();
}
}
}
}
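/**
* Download and parse a feed with ROME.
* @param url feed url
* @return the parsed feed, or null if it could not be fetched or parsed
*/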
private SyndFeed getFeed(String url) {
try {
URL feedUrl = new URL(url);
URLConnection openConnection = feedUrl.openConnection();
openConnection.addRequestProperty("User-Agent", "RSS River for Elasticsearch (https://github.com/dadoonet/rssriver)");
SyndFeedInput input = new SyndFeedInput();
input.setPreserveWireFeed(true);
SyndFeed feed = input.build(new XmlReader(openConnection));
return feed;
} catch (MalformedURLException e) {
logger.error("RSS Url is incorrect : [{}].", url);
} catch (IllegalArgumentException e) {
logger.error("Feed from [{}] is incorrect.", url);
} catch (FeedException e) {
logger.error("Can not parse feed from [{}].", url);
} catch (IOException e) {
logger.error("Can not read feed from [{}].", url);
}
return null;
}
/**
* Check if a mapping already exists in an index
* @param index Index name
* @param type Mapping name
* @return true if mapping exists
*/
private boolean isMappingExist(String index, String type) {
ClusterState cs = client.admin().cluster().prepareState().setIndices(index).execute().actionGet().getState();
IndexMetaData imd = cs.getMetaData().index(index);
if (imd == null) return false;
MappingMetaData mdd = imd.mapping(type);
return mdd != null;
}
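/**
* Create the type mapping in the index if it does not already exist.
* @param index Index name
* @param type Mapping name
* @param xcontent Mapping definition (may be null to skip)
* @throws Exception if the mapping creation is not acknowledged
*/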
private void pushMapping(String index, String type, XContentBuilder xcontent) throws Exception {
if (logger.isTraceEnabled()) logger.trace("pushMapping("+index+","+type+")");
// If type does not exist, we create it
boolean mappingExist = isMappingExist(index, type);
if (!mappingExist) {
logger.debug("Mapping ["+index+"]/["+type+"] doesn't exist. Creating it.");
// Use the provided mapping definition if there is one
if (xcontent != null) {
if (logger.isTraceEnabled()) logger.trace("Mapping for ["+index+"]/["+type+"]="+xcontent.string());
// Create type and mapping
PutMappingResponse response = client.admin().indices()
.preparePutMapping(index)
.setType(type)
.setSource(xcontent)
.execute().actionGet();
if (!response.isAcknowledged()) {
throw new Exception("Could not define mapping for type ["+index+"]/["+type+"].");
} else {
if (logger.isDebugEnabled()) {
logger.debug("Mapping definition for ["+index+"]/["+type+"] succesfully created.");
}
}
} else {
if (logger.isDebugEnabled()) logger.debug("No mapping definition for ["+index+"]/["+type+"]. Ignoring.");
}
} else {
if (logger.isDebugEnabled()) logger.debug("Mapping ["+index+"]/["+type+"] already exists.");
}
if (logger.isTraceEnabled()) logger.trace("/pushMapping("+index+","+type+")");
}
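/**
* One parser thread per feed: polls the feed at its update rate and indexes new entries
* through the bulk processor until the river is closed.
*/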
private class RSSParser implements Runnable {
private String url;
private TimeValue updateRate;
private String feedname;
private boolean ignoreTtl;
public RSSParser(String feedname, String url, TimeValue updateRate, boolean ignoreTtl) {
this.feedname = feedname;
this.url = url;
this.updateRate = updateRate;
this.ignoreTtl = ignoreTtl;
if (logger.isInfoEnabled()) logger.info("creating rss stream river [{}] for [{}] every [{}] ms",
feedname, url, updateRate);
}
public RSSParser(RssRiverFeedDefinition feedDefinition) {
this(feedDefinition.getFeedname(),
feedDefinition.getUrl(),
feedDefinition.getUpdateRate(),
feedDefinition.isIgnoreTtl());
}
@SuppressWarnings("unchecked")
@Override
public void run() {
while (true) {
if (closed) {
return;
}
// Fetch and parse the feed
SyndFeed feed = getFeed(url);
if (feed != null) {
if (logger.isDebugEnabled()) logger.debug("Reading feed from {}", url);
Date feedDate = feed.getPublishedDate();
if (logger.isDebugEnabled()) logger.debug("Feed publish date is {}", feedDate);
String lastupdateField = "_lastupdated_" + UUID.nameUUIDFromBytes(url.getBytes()).toString();
Date lastDate = getLastDateFromRiver(lastupdateField);
// Comparing dates to see if we have something to do or not
if (lastDate == null || (feedDate != null && feedDate.after(lastDate))) {
// We have to send results to ES
if (logger.isTraceEnabled()) logger.trace("Feed is updated : {}", feed);
try {
// Now send each new entry to ES
Date mostRecentItemDate = null;
for (SyndEntry message : (Iterable<SyndEntry>) feed.getEntries()) {
// We don't have a global date, so let's see if we have one in items
if (feedDate == null) {
if (message.getUpdatedDate() != null) {
if (lastDate == null || message.getUpdatedDate().after(lastDate)) {
if (mostRecentItemDate == null || message.getUpdatedDate().after(mostRecentItemDate)) {
mostRecentItemDate = message.getUpdatedDate();
if (logger.isTraceEnabled()) logger.trace("No feed date. Using item updated date : {}", feedDate);
}
}
}
if (message.getPublishedDate() != null) {
if (lastDate == null || message.getPublishedDate().after(lastDate)) {
if (mostRecentItemDate == null || message.getPublishedDate().after(mostRecentItemDate)) {
mostRecentItemDate = message.getPublishedDate();
if (logger.isTraceEnabled()) logger.trace("No feed date. Using item published date : {}", feedDate);
}
}
}
}
String description = "";
if (message.getDescription() != null) {
description = message.getDescription().getValue();
}
// Build a deterministic id from the entry description so the same item is only indexed once
String id = UUID.nameUUIDFromBytes(description.getBytes()).toString();
// Let's look if object already exists
GetResponse oldMessage = client.prepareGet(indexName, typeName, id).execute().actionGet();
if (!oldMessage.isExists()) {
bulkProcessor.add(indexRequest(indexName).type(typeName).id(id).source(toJson(message, riverName.getName(), feedname, raw)));
if (logger.isDebugEnabled()) logger.debug("FeedMessage update detected for source [{}]", feedname != null ? feedname : "undefined");
if (logger.isTraceEnabled()) logger.trace("FeedMessage is : {}", message);
} else {
if (logger.isTraceEnabled()) logger.trace("FeedMessage {} already exist. Ignoring", id);
}
}
if (feedDate == null) {
feedDate = mostRecentItemDate;
}
if (logger.isTraceEnabled()) {
logger.trace("processing [_seq ]: [{}]/[{}]/[{}], last_seq [{}]", indexName, riverName.name(), lastupdateField, feedDate);
}
// We store the lastupdate date
bulkProcessor.add(indexRequest("_river").type(riverName.name()).id(lastupdateField)
.source(jsonBuilder().startObject().startObject("rss").field(lastupdateField, feedDate).endObject().endObject()));
} catch (IOException e) {
logger.warn("failed to add feed message entry to bulk indexing");
}
} else {
// Nothing new... Just relax !
if (logger.isDebugEnabled()) logger.debug("Nothing new in the feed... Relaxing...");
}
// #8 : Use the ttl rss field to auto adjust feed refresh rate
if (!ignoreTtl && feed.originalWireFeed() != null && feed.originalWireFeed() instanceof Channel) {
Channel channel = (Channel) feed.originalWireFeed();
if (channel.getTtl() > 0) {
int minutes = channel.getTtl();
if (minutes != updateRate.minutes()) {
updateRate = TimeValue.timeValueMinutes(minutes);
if (logger.isInfoEnabled())
logger.info("Auto adjusting update rate with provided ttl: {}", updateRate);
}
}
}
}
try {
if (logger.isDebugEnabled()) logger.debug("Rss river is going to sleep for {}", updateRate);
Thread.sleep(updateRate.millis());
} catch (InterruptedException e1) {
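// interrupted, most likely by close(): loop back and exit on the closed check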
}
}
}
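/**
* Read the last update date for this feed from the river state stored in the "_river" index.
* @param lastupdateField state document id for this feed
* @return the last update date, or null if none has been stored yet
*/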
@SuppressWarnings("unchecked")
private Date getLastDateFromRiver(String lastupdateField) {
Date lastDate = null;
try {
// Refresh the _river index so we read the latest stored state
client.admin().indices().prepareRefresh("_river").execute().actionGet();
GetResponse lastSeqGetResponse =
client.prepareGet("_river", riverName().name(), lastupdateField).execute().actionGet();
if (lastSeqGetResponse.isExists()) {
Map<String, Object> rssState = (Map<String, Object>) lastSeqGetResponse.getSourceAsMap().get("rss");
if (rssState != null) {
Object lastupdate = rssState.get(lastupdateField);
if (lastupdate != null) {
String strLastDate = lastupdate.toString();
lastDate = ISODateTimeFormat.dateOptionalTimeParser().parseDateTime(strLastDate).toDate();
}
}
} else {
// First call
if (logger.isDebugEnabled()) logger.debug("{} doesn't exist", lastupdateField);
}
} catch (Exception e) {
logger.warn("failed to get _lastupdate, throttling....", e);
}
return lastDate;
}
}
}