Package net.yacy.cora.services.federated.solr

Source Code of net.yacy.cora.services.federated.solr.SolrScheme

/**
*  SolrScheme
*  Copyright 2011 by Michael Peter Christen
*  First released 14.04.2011 at http://yacy.net
*
*  $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
*  $LastChangedRevision: 7654 $
*  $LastChangedBy: orbiter $
*
*  This library is free software; you can redistribute it and/or
*  modify it under the terms of the GNU Lesser General Public
*  License as published by the Free Software Foundation; either
*  version 2.1 of the License, or (at your option) any later version.
*
*  This library is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
*  Lesser General Public License for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program in the file lgpl21.txt
*  If not, see <http://www.gnu.org/licenses/>.
*/

package net.yacy.cora.services.federated.solr;


import java.io.File;
import java.net.InetAddress;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;

import org.apache.solr.common.SolrInputDocument;

public class SolrScheme extends ConfigurationSet {

    /**
     * initialize with an empty ConfigurationSet which will cause that all the index
     * attributes are used
     */
    public SolrScheme() {
        super();
    }

    /**
     * Initializes the scheme from a given configuration file.
     * The configuration file simply contains a list of lines with keywords;
     * the file is handed unchanged to the ConfigurationSet super constructor.
     *
     * @param configurationFile the file listing the keywords of this scheme
     */
    public SolrScheme(final File configurationFile) {
        super(configurationFile);
    }

    private void addSolr(final SolrInputDocument solrdoc, final String key, final String value) {
        if (isEmpty() || contains(key)) solrdoc.setField(key, value);
    }

    private void addSolr(final SolrInputDocument solrdoc, final String key, final Date value) {
        if (isEmpty() || contains(key)) solrdoc.setField(key, value);
    }

    private void addSolr(final SolrInputDocument solrdoc, final String key, final int value) {
        if (isEmpty() || contains(key)) solrdoc.setField(key, value);
    }

    private void addSolr(final SolrInputDocument solrdoc, final String key, final String[] value) {
        if (isEmpty() || contains(key)) solrdoc.setField(key, value);
    }

    private void addSolr(final SolrInputDocument solrdoc, final String key, final float value) {
        if (isEmpty() || contains(key)) solrdoc.setField(key, value);
    }

    private void addSolr(final SolrInputDocument solrdoc, final String key, final boolean value) {
        if (isEmpty() || contains(key)) solrdoc.setField(key, value);
    }

    private void addSolr(final SolrInputDocument solrdoc, final String key, final String value, final float boost) {
        if (isEmpty() || contains(key)) solrdoc.setField(key, value, boost);
    }

    public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) {
        // we user the SolrCell design as index scheme
        final SolrInputDocument solrdoc = new SolrInputDocument();
        final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
        addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before)
        addSolr(solrdoc, "id", id);
        addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f);
        final InetAddress address = Domains.dnsResolve(digestURI.getHost());
        if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress());
        if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost());
        addSolr(solrdoc, "title", yacydoc.dc_title());
        addSolr(solrdoc, "author", yacydoc.dc_creator());
        addSolr(solrdoc, "description", yacydoc.dc_description());
        addSolr(solrdoc, "content_type", yacydoc.dc_format());
        addSolr(solrdoc, "last_modified", header.lastModified());
        addSolr(solrdoc, "keywords", yacydoc.dc_subject(' '));
        final String content = UTF8.String(yacydoc.getTextBytes());
        addSolr(solrdoc, "text_t", content);
        if (isEmpty() || contains("wordcount_i")) {
            final int contentwc = content.split(" ").length;
            addSolr(solrdoc, "wordcount_i", contentwc);
        }

        // path elements of link
        final String path = digestURI.getPath();
        if (path != null && (isEmpty() || contains("attr_paths"))) {
            final String[] paths = path.split("/");
            if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths);
        }

        // list all links
        final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
        int c = 0;
        if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
        if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
        if (isEmpty() || contains("attr_inboundlinks")) {
            final String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
            for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
                final Properties p = alllinks.get(url);
                final String name = p.getProperty("name", "");
                final String rel = p.getProperty("rel", "");
                inboundlinks[c++] =
                    "<a href=\"" + url.toNormalform(false, false) + "\"" +
                    (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
                    ">" +
                    ((name.length() > 0) ? name : "") + "</a>";
            }
            addSolr(solrdoc, "attr_inboundlinks", inboundlinks);
        }
        c = 0;
        if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
        if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
        if (isEmpty() || contains("attr_outboundlinks")) {
            final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
            for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
                final Properties p = alllinks.get(url);
                final String name = p.getProperty("name", "");
                final String rel = p.getProperty("rel", "");
                outboundlinks[c++] =
                    "<a href=\"" + url.toNormalform(false, false) + "\"" +
                    (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
                    ">" +
                    ((name.length() > 0) ? name : "") + "</a>";
            }
            addSolr(solrdoc, "attr_outboundlinks", outboundlinks);
        }
        // charset
        addSolr(solrdoc, "charset_s", yacydoc.getCharset());

        // coordinates
        if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
            addSolr(solrdoc, "lon_coordinate", yacydoc.lon());
            addSolr(solrdoc, "lat_coordinate", yacydoc.lat());
        }
        addSolr(solrdoc, "httpstatus_i", 200);
        final Object parser = yacydoc.getParserObject();
        if (parser instanceof ContentScraper) {
            final ContentScraper html = (ContentScraper) parser;

            // header tags
            int h = 0;
            int f = 1;
            for (int i = 1; i <= 6; i++) {
                final String[] hs = html.getHeadlines(i);
                h = h | (hs.length > 0 ? f : 0);
                f = f * 2;
                addSolr(solrdoc, "attr_h" + i, hs);
            }
            addSolr(solrdoc, "htags_i", h);

            // canonical tag
            if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));

            // meta tags
            final Map<String, String> metas = html.getMetas();
            final String robots = metas.get("robots");
            if (robots != null) addSolr(solrdoc, "metarobots_t", robots);
            final String generator = metas.get("generator");
            if (generator != null) addSolr(solrdoc, "metagenerator_t", generator);

            // bold, italic
            final String[] bold = html.getBold();
            addSolr(solrdoc, "boldcount_i", bold.length);
            if (bold.length > 0) {
                addSolr(solrdoc, "attr_bold", bold);
                if (isEmpty() || contains("attr_boldcount")) {
                    addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
                }
            }
            final String[] italic = html.getItalic();
            addSolr(solrdoc, "italiccount_i", italic.length);
            if (italic.length > 0) {
                addSolr(solrdoc, "attr_italic", italic);
                if (isEmpty() || contains("attr_italiccount")) {
                    addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
                }
            }
            final String[] li = html.getLi();
            addSolr(solrdoc, "licount_i", li.length);
            if (li.length > 0) addSolr(solrdoc, "attr_li", li);

            // images
            if (isEmpty() || contains("attr_images")) {
                final Collection<ImageEntry> imagesc = html.getImages().values();
                final String[] images = new String[imagesc.size()];
                c = 0;
                for (final ImageEntry ie: imagesc) images[c++] = ie.toString();
                addSolr(solrdoc, "imagescount_i", images.length);
                if (images.length > 0) addSolr(solrdoc, "attr_images", images);
            }

            // style sheets
            if (isEmpty() || contains("attr_css")) {
                final Map<MultiProtocolURI, String> csss = html.getCSS();
                final String[] css = new String[csss.size()];
                c = 0;
                for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
                    css[c++] =
                        "<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
                        " href=\""+ entry.getKey().toNormalform(false, false, false, false) + "\" />";
                }
                addSolr(solrdoc, "csscount_i", css.length);
                if (css.length > 0) addSolr(solrdoc, "attr_css", css);
            }

            // Scripts
            if (isEmpty() || contains("attr_scripts")) {
                final Set<MultiProtocolURI> scriptss = html.getScript();
                final String[] scripts = new String[scriptss.size()];
                c = 0;
                for (final MultiProtocolURI url: scriptss) {
                    scripts[c++] = url.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "scriptscount_i", scripts.length);
                if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts);
            }

            // Frames
            if (isEmpty() || contains("attr_frames")) {
                final Set<MultiProtocolURI> framess = html.getFrames();
                final String[] frames = new String[framess.size()];
                c = 0;
                for (final MultiProtocolURI entry: framess) {
                    frames[c++] = entry.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "framesscount_i", frames.length);
                if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames);
            }

            // IFrames
            if (isEmpty() || contains("attr_iframes")) {
                final Set<MultiProtocolURI> iframess = html.getIFrames();
                final String[] iframes = new String[iframess.size()];
                c = 0;
                for (final MultiProtocolURI entry: iframess) {
                    iframes[c++] = entry.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "iframesscount_i", iframes.length);
                if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes);
            }

            // flash embedded
            addSolr(solrdoc, "flash_b", html.containsFlash());

            // generic evaluation pattern
            for (final String model: html.getEvaluationModelNames()) {
                if (isEmpty() || contains("attr_" + model)) {
                    final String[] scorenames = html.getEvaluationModelScoreNames(model);
                    if (scorenames.length > 0) {
                        addSolr(solrdoc, "attr_" + model, scorenames);
                        addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
                    }
                }
            }

            // response time
            addSolr(solrdoc, "responsetime_i", header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
        }
        return solrdoc;
    }


    /*
     * standard solr scheme

   <field name="name" type="textgen" indexed="true" stored="true"/>
   <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />

   <field name="weight" type="float" indexed="true" stored="true"/>
   <field name="price"  type="float" indexed="true" stored="true"/>
   <field name="popularity" type="int" indexed="true" stored="true" />

   <!-- Common metadata fields, named specifically to match up with
     SolrCell metadata when parsing rich documents such as Word, PDF.
     Some fields are multiValued only because Tika currently may return
     multiple values for them.
   -->
   <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="subject" type="text" indexed="true" stored="true"/>
   <field name="description" type="text" indexed="true" stored="true"/>
   <field name="comments" type="text" indexed="true" stored="true"/>
   <field name="author" type="textgen" indexed="true" stored="true"/>
   <field name="keywords" type="textgen" indexed="true" stored="true"/>
   <field name="category" type="textgen" indexed="true" stored="true"/>
   <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="last_modified" type="date" indexed="true" stored="true"/>
   <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>

     */
TOP

Related Classes of net.yacy.cora.services.federated.solr.SolrScheme

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.