Package lucandra.wikipedia

Source Code of lucandra.wikipedia.WikipediaIndexWorker

/**
* Copyright T Jake Luciani
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package lucandra.wikipedia;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
import org.apache.solr.common.SolrInputDocument;

public class WikipediaIndexWorker implements Callable<Integer> {

    // each worker thread has a connection to cassandra
    private static ConcurrentLinkedQueue<List<SolrInputDocument>> allDocBuffers = new ConcurrentLinkedQueue<List<SolrInputDocument>>();
    private static ThreadLocal<CommonsHttpSolrServer> clientPool = new ThreadLocal<CommonsHttpSolrServer>();
    private static AtomicInteger connectionCounter = new AtomicInteger(0);
    private static CommonsHttpSolrServer oneClient;
    public  static final ArrayList<String> hosts = new ArrayList<String>();
    private static final Random r = new Random();
   
    static int port = 8983;
   
    //Add shutdown hook for batched commits to complete
    static {
        Runtime.getRuntime().addShutdownHook(new Thread() {
            public void run() {
                List<SolrInputDocument> docs;
                while ((docs = allDocBuffers.poll()) != null) {
                    try {
                      
                        if(!docs.isEmpty())
                            oneClient.add(docs);
                           
                    } catch (SolrServerException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }

                System.err.println("committed");
            }
        });
    }
   
    // this is the article to index
    private Article article;

    public WikipediaIndexWorker(Article article) {
        this.article = article;
    }

    private CommonsHttpSolrServer getIndexWriter() throws MalformedURLException  {
        CommonsHttpSolrServer indexWriter = clientPool.get();

        if (indexWriter == null) {
           
            if(hosts.size() == 0)
                throw new RuntimeException("no hosts defined");  
           
            int id = connectionCounter.incrementAndGet();
           
            indexWriter = new StreamingUpdateSolrServer("http://"+hosts.get(id % hosts.size())+":" + port + "/solandra/~wikassandra", 512, 8);
            indexWriter.setAllowCompression(true);
           
            clientPool.set(indexWriter);
        }

        return indexWriter;
    }

    public Integer call() throws Exception {

        CommonsHttpSolrServer indexWriter = getIndexWriter();

        SolrInputDocument doc = new SolrInputDocument();
       
        doc.addField("title", article.title);
       
        if(article.text != null)
            doc.addField("text", new String(article.text,"UTF-8"));
       
        doc.addField("url", article.url);
       
        indexWriter.add(doc);
        
        return article.getSize();
    }

}
TOP

Related Classes of lucandra.wikipedia.WikipediaIndexWorker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.