Package com.flaptor.indextank.search

Source Code of com.flaptor.indextank.search.SnippetSearcher$Window

/*
* Copyright (c) 2011 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package com.flaptor.indextank.search;

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;

import com.flaptor.indextank.index.Document;
import com.flaptor.indextank.query.AToken;
import com.flaptor.indextank.query.IndexEngineParser;
import com.flaptor.indextank.query.Query;
import com.flaptor.indextank.query.TermQuery;
import com.flaptor.indextank.storage.alternatives.DocumentStorage;
import com.flaptor.indextank.util.CharacterTranslator;
import com.flaptor.util.Execute;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

public class SnippetSearcher extends AbstractDocumentSearcher {
  // Logger for this class, named with whatever Execute.whoAmI() returns.
  private static final Logger logger = Logger.getLogger(Execute.whoAmI());
    // Searcher that actually runs the queries; its results are decorated by this class.
    private final DocumentSearcher delegate;
  // Storage the documents are read from when fetching fields and building snippets.
  private final DocumentStorage storage;
  // Parser used to tokenize the stored text of a field before looking for matches.
  private final IndexEngineParser parser;
    // Snippeter instance to use for each SnippeterType.
    private final Map<SnippeterType, Snippeter> snippeters;

    /** Snippeting strategies; search() picks one from the 'snippet_type' parameter ('html' or 'lines'). */
    public enum SnippeterType { HTML_AWARE, LINE_AWARE }

    public SnippetSearcher(DocumentSearcher searcher, DocumentStorage storage, IndexEngineParser parser){
    Preconditions.checkNotNull(searcher);
    Preconditions.checkNotNull(storage);
        this.delegate = searcher;
        this.storage = storage;
        this.parser = parser;


        this.snippeters = ImmutableMap.ofSnippeterType.HTML_AWARE, new HtmlAwareSnippeter(),
                                            SnippeterType.LINE_AWARE, new LineAwareSnippeter());

    }
  

    /**
     * @see AbstractDocumentSearcher#search(Query query, int start, int limit, int scoringFunctionIndex, Map<String, String> extraParameters).
     *
     * @param extraParameters: It will process 'fetch_fields', 'snippet_fields' and 'snippet_type'.
     *      'fetch_fields' and 'snippet_fields' are comma-separated lists of field names to fetch an snippet.
     *      'snippet_type' can be either 'html' or 'lines'. 'html' is the default.
     *
     */

    @Override
    public SearchResults search(Query query, int start, int limit, int scoringFunctionIndex, Map<String, String> extraParameters) throws InterruptedException {
      // call delegate searcher
        SearchResults results = this.delegate.search(query, start, limit, scoringFunctionIndex, extraParameters);
       
        long startTime = System.currentTimeMillis();
       
        String[] fetchFields = parseFields(extraParameters, "fetch");
        String[] snippetFields = parseFields(extraParameters, "snippet");
        Set<TermQuery> positiveTerms = query.getRoot().getPositiveTerms();
               
        // find out which snippeter type is the right one for this query
        String snType = extraParameters.get("snippet_type");
        Snippeter sn = null;
        if (null == snType || "html".equalsIgnoreCase(snType)) {
            sn = this.snippeters.get(SnippeterType.HTML_AWARE);
        } else if ("lines".equalsIgnoreCase(snType)) {
            sn = this.snippeters.get(SnippeterType.LINE_AWARE);
        } else {
            throw new IllegalArgumentException("snippet_type has to be either 'html' or 'lines'");
        }
       
        if (fetchFields.length + snippetFields.length > 0) {
          for (SearchResult result : results.getResults()) {
            Document data = storage.getDocument(result.getDocId());


            // fetch fields
            for (String field : fetchFields) {
                    // handle '*', as a fetch all
                    if ("*".equals(field.trim())){
                        // assume we get the actual fields, not a copy.
                        result.getFields().putAll(data.asMap());
                        break;
                    }
                    String text = data.getField(field);
                    if (null != text) {
                        result.setField(field, text);
                    }
        }

            // snippet fields
            for (String field : snippetFields) {
                    String text = data.getField(field);
                    if (null != text) {
                        result.setField("snippet_" + field, sn.snippet(positiveTerms, field, text));
                    }
            }
          }
        }
        long endTime = System.currentTimeMillis();
        logger.debug("(search) fetching & snippeting took: " + (endTime - startTime) + " ms.");

        return results;

    }

    @Override
    public int countMatches(Query query) throws InterruptedException {
        return this.delegate.countMatches(query);
    }
   
  private static String[] parseFields(Map<String, String> extraParameters, String key) {
    if (extraParameters.containsKey(key + "_fields")) {
          return extraParameters.get(key + "_fields").split(",");
        } else {
          return new String[0];
        }
  }
 
    private abstract class Snippeter {

        /**
         * Hook that lets a subclass move the position where a snippet starts.
         * mark() calls it with the start offset of the chosen window and the
         * full field text, and begins the snippet at the returned position.
         */
        protected abstract int adjustStart(int position, String text);
        /**
         * Hook that lets a subclass move the position where a snippet ends.
         * mark() calls it with the end offset of the chosen window and the
         * full field text, and ends the snippet at the returned position.
         */
        protected abstract int adjustEnd(int position, String text);

        private String snippet(Set<TermQuery> terms, String fieldName, String text) {
            Set<String> termsForField = getTermsForField(terms, fieldName);
            long t1 = System.currentTimeMillis();
            List<AToken> tokens = Lists.newArrayList(parser.parseDocumentField(fieldName, text));
            long t2 = System.currentTimeMillis();
            logger.debug(String.format("Parsing field %s took %d ms.", fieldName, t2 - t1));
            List<Integer> matches = Lists.newArrayList();
           
            for (int i = 0; i < tokens.size(); i++) {
                String termInText = tokens.get(i).getText();
               
                for (String termInQuery : termsForField) {
                    if ((termInQuery.endsWith("*") && termInText.startsWith(termInQuery.substring(0, termInQuery.length() - 1)))
                            || termInQuery.equals(termInText)) {
                        matches.add(i);
                    }
                }
            }
            if (matches.size() == 0) {
                return "";
            }
            Window window = findBestWindow(tokens, matches, 200);
            long t3 = System.currentTimeMillis();
            logger.debug(String.format("Finding best window for %d matches took %d ms.", matches.size(), t3 - t2));
            String markedText = mark(window, text);
            long t4 = System.currentTimeMillis();
            logger.debug(String.format("Marking text %d chars in %d ms.", markedText.length(), t4 - t3));
            return markedText;
        }


        private String mark(Window window, String text) {
            Preconditions.checkArgument(!window.matches.isEmpty(), "Cannot mark an empty window");
            StringBuilder buff = new StringBuilder(500);
            String open = "<b>";
            String close = "</b>";
            int current = window.start;

            // let subclasses handle where snippets start
            current = adjustStart(current, text);
            for (AToken token : window.matches) {
                escapeAndAppend(buff, text, current, token.getStartOffset());
                buff.append(open);
                int start = token.getStartOffset();
                int endOffset = token.getEndOffset();
                escapeAndAppend(buff, text, start, endOffset);
                buff.append(close);
                current = endOffset;
            }

            // let subclasses handle where snippets end
            int finish = window.end;
            finish = adjustEnd(finish, text);
            escapeAndAppend(buff, text, current, finish);
            return buff.toString();
        }
       
        private Window findBestWindow(List<AToken> tokens, List<Integer> matches, int maxSize) {
            if (matches.size() == 0) {
                return null;
            }
            List<AToken> mtokens = asTokens(matches, tokens);
            List<Integer> best = null;
            float bestScore = 0f;
            int left = 0;
            int right = 0;
        while (right < matches.size()) {
            right++;
              while (mtokens.get(right - 1).getEndOffset() - mtokens.get(left).getStartOffset() > maxSize) {
              left++;
            }
              List<AToken> candidate = mtokens.subList(left, right);
            float score = scoreWindow(candidate);
            if (score > bestScore) {
              bestScore = score;
              best = matches.subList(left, right);
            }
        }
            return getWindowContext(tokens, best);
        }
        private Window getWindowContext(List<AToken> tokens, List<Integer> best) {
            int left = best.get(0);
            int right = best.get(best.size()-1);
            Window window = new Window();
            window.matches = asTokens(best, tokens);
            window.start = tokens.get(Math.max(0, left - 5)).getStartOffset();
            window.end = tokens.get(Math.min(right + 24, tokens.size()-1)).getEndOffset();
            return window;
        }
       
        private float scoreWindow(List<AToken> candidate) {
            Set<String> terms = Sets.newHashSet();
            for (AToken token : candidate) {
                terms.add(token.getText());
            }
            return candidate.size() * terms.size() * terms.size();
        }

    }


    /**
     * A Snippeter that tries not to cut HTML entities
     */
    private class HtmlAwareSnippeter extends Snippeter {

        // Snippeter abstract methods
      
        protected int adjustStart(int position, String text) {

            // tokenizers may cut off & on entities  .. fix that
            if (position > 0 && text.charAt(position -1) == '&') {
                return position -1;
            }

            return position;
        }
       
        protected int adjustEnd(int position, String text){

            // tokenizers miss final ; on entities. Try to fix that
            return position;
        }

    }

    /**
     * A Snippeter that returns complete lines.
     */
    private class LineAwareSnippeter extends Snippeter {

        // Snippeter abstract methods
       
        protected int adjustEnd(int finish, String text) {
            while (finish < text.length() && text.charAt(finish) != '\n') {
                finish++;
            }

            if (finish < text.length()) {
                // loop above ended because of text.charAt ..
                // return the endline
                finish++;
            }
            return finish;
        }

        protected int adjustStart(int current, String text) {
            while (current > 0 && text.charAt(current-1) != '\n') {
                current--;
            }
            return current;
        }

    }


    private List<AToken> asTokens(List<Integer> matches, final List<AToken> tokens) {
        return Lists.transform(matches, new Function<Integer, AToken>() {
            @Override
            public AToken apply(Integer pair) {
                return tokens.get(pair);
            }
        });
    }
   
    private static class Window {
      int start;
      int end;
        List<AToken> matches = Lists.newArrayList();
    }
   
    private Set<String> getTermsForField(Set<TermQuery> terms, String fieldName) {
        Set<String> retval = new HashSet<String>();
        for (TermQuery t : terms) {
            if (t.getField().equals(fieldName)) {
                retval.add(t.getTerm());
            }
        }
        return retval;
    }

    private void escapeAndAppend(StringBuilder dest, String str, int start, int offset) {
        if (dest == null ) {
            throw new IllegalArgumentException ("The Writer must not be null.");
        }
        if (str == null) {
            return;
        }

        CharacterTranslator.HTML4.escape(dest, str, start, offset);

    }

}
TOP

Related Classes of com.flaptor.indextank.search.SnippetSearcher$Window

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.