package org.wltea.analyzer.lucene;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;
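/**
 * IK query parser.
 * Builds non-conflicting permutations of ambiguous segmentation results,
 * which improves search hits for ambiguous keywords.
 * An optimized implementation targeting IK Analyzer V3.
 * @author 林良益
 */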
public final class IKQueryParser {
// Per-thread cache that maps a raw query string to its parsed TokenBranch tree.
// The reference itself is never reassigned, hence final.
private static final ThreadLocal<Map<String, TokenBranch>> keywordCacheThreadLocal =
        new ThreadLocal<Map<String, TokenBranch>>();

// Segmentation strategy flag handed to IKSegmentation; true selects max-word-length segmentation.
private static boolean isMaxWordLength = false;
* 设置分词策略
* isMaxWordLength = true 采用最大词长分词
* @param isMaxWordLength
public static void setMaxWordLength(boolean isMaxWordLength) {
IKQueryParser.isMaxWordLength = isMaxWordLength ;
* 优化query队列
* 减少Query表达式的嵌套
* @param queries
* @return
private static Query optimizeQueries(List<Query> queries){
//生成当前branch 的完整query
if(queries.size() == 0){
return null;
}else if(queries.size() == 1){
return queries.get(0);
BooleanQuery mustQueries = new BooleanQuery();
for(Query q : queries){
mustQueries.add(q, Occur.MUST);
return mustQueries;
* 获取线程本地的解析缓存
* @return
private static Map<String , TokenBranch> getTheadLocalCache(){
Map<String , TokenBranch> keywordCache = keywordCacheThreadLocal.get();
if(keywordCache == null){
keywordCache = new HashMap<String , TokenBranch>(4);
return keywordCache;
* 缓存解析结果的博弈树
* @param query
* @return
private static TokenBranch getCachedTokenBranch(String query){
Map<String , TokenBranch> keywordCache = getTheadLocalCache();
return keywordCache.get(query);
* 缓存解析结果的博弈树
* @param query
* @return
private static void cachedTokenBranch(String query , TokenBranch tb){
Map<String , TokenBranch> keywordCache = getTheadLocalCache();
keywordCache.put(query, tb);
* 单连续字窜(不带空格符)单Field查询分析
* @param field
* @param query
* @return
* @throws IOException
private static Query _parse(String field , String query) throws IOException{
if(field == null){
throw new IllegalArgumentException("parameter \"field\" is null");
if(query == null || "".equals(query.trim())){
return new TermQuery(new Term(field));
TokenBranch root = getCachedTokenBranch(query);
if(root != null){
return optimizeQueries(root.toQueries(field));
root = new TokenBranch(null);
StringReader input = new StringReader(query.trim());
IKSegmentation ikSeg = new IKSegmentation(input , isMaxWordLength);
for(Lexeme lexeme = ikSeg.next() ; lexeme != null ; lexeme = ikSeg.next()){
cachedTokenBranch(query , root);
return optimizeQueries(root.toQueries(field));
* 单条件,单Field查询分析
* @param field -- Document field name
* @param query -- keyword
* @return Query 查询逻辑对象
* @throws IOException
public static Query parse(String field , String query) throws IOException{
if(field == null){
throw new IllegalArgumentException("parameter \"field\" is null");
String[] qParts = query.split("\\s");
if(qParts.length > 1){
BooleanQuery resultQuery = new BooleanQuery();
for(String q : qParts){
Query partQuery = _parse(field , q);
if(partQuery != null &&
(!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
resultQuery.add(partQuery, Occur.SHOULD);
return resultQuery;
return _parse(field , query);
* 多Field,单条件查询分析
* @param fields -- Document fields name
* @param query -- keyword
* @return Query 查询逻辑对象
* @throws IOException
public static Query parseMultiField(String[] fields , String query) throws IOException{
if(fields == null){
throw new IllegalArgumentException("parameter \"fields\" is null");
BooleanQuery resultQuery = new BooleanQuery();
for(String field : fields){
if(field != null){
Query partQuery = parse(field , query);
if(partQuery != null &&
(!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
resultQuery.add(partQuery, Occur.SHOULD);
return resultQuery;
* 多Field,单条件,多Occur查询分析
* @param fields -- Document fields name
* @param query -- keyword
* @param flags -- BooleanClause
* @return Query 查询逻辑对象
* @throws IOException
public static Query parseMultiField(String[] fields , String query , BooleanClause.Occur[] flags) throws IOException{
if(fields == null){
throw new IllegalArgumentException("parameter \"fields\" is null");
if(flags == null){
throw new IllegalArgumentException("parameter \"flags\" is null");
if (flags.length != fields.length){
throw new IllegalArgumentException("flags.length != fields.length");
BooleanQuery resultQuery = new BooleanQuery();
for(int i = 0; i < fields.length; i++){
if(fields[i] != null){
Query partQuery = parse(fields[i] , query);
if(partQuery != null &&
(!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
resultQuery.add(partQuery, flags[i]);
return resultQuery;
* 多Field多条件查询分析
* @param fields
* @param queries
* @return Query 查询逻辑对象
* @throws IOException
public static Query parseMultiField(String[] fields , String[] queries) throws IOException{
if(fields == null){
throw new IllegalArgumentException("parameter \"fields\" is null");
if(queries == null){
throw new IllegalArgumentException("parameter \"queries\" is null");
if (queries.length != fields.length){
throw new IllegalArgumentException("queries.length != fields.length");
BooleanQuery resultQuery = new BooleanQuery();
for(int i = 0; i < fields.length; i++){
if(fields[i] != null){
Query partQuery = parse(fields[i] , queries[i]);
if(partQuery != null &&
(!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
resultQuery.add(partQuery, Occur.SHOULD);
return resultQuery;
* 多Field,多条件,多Occur查询分析
* @param fields
* @param queries
* @param flags
* @return Query 查询逻辑对象
* @throws IOException
public static Query parseMultiField(String[] fields , String[] queries , BooleanClause.Occur[] flags) throws IOException{
if(fields == null){
throw new IllegalArgumentException("parameter \"fields\" is null");
if(queries == null){
throw new IllegalArgumentException("parameter \"queries\" is null");
if(flags == null){
throw new IllegalArgumentException("parameter \"flags\" is null");
if (!(queries.length == fields.length && queries.length == flags.length)){
throw new IllegalArgumentException("queries, fields, and flags array have have different length");
BooleanQuery resultQuery = new BooleanQuery();
for(int i = 0; i < fields.length; i++){
if(fields[i] != null){
Query partQuery = parse(fields[i] , queries[i]);
if(partQuery != null &&
(!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
resultQuery.add(partQuery, flags[i]);
return resultQuery;
/**
 * Token (lexeme) branch.
 * When segmentation is ambiguous, token branches hold the different ambiguous combinations.
 * @author 林良益
 */
private static class TokenBranch{
// Result codes produced by checkAccept and consumed by accept.
// REFUSED: this branch cannot take the lexeme.
private static final int REFUSED = -1;
// ACCEPTED: this branch takes the lexeme.
private static final int ACCEPTED = 0;
// TONEXT: the lexeme belongs to the adjacent (next) branch.
private static final int TONEXT = 1;
// Begin position of the branch; set from the lexeme's begin position in the constructor.
private int leftBorder;
// End position of the branch; starts at the lexeme's end position and is pushed right
// when an accepted lexeme ends further right.
private int rightBorder;
// The lexeme this branch was created for; stays null when the branch was built with null.
private Lexeme lexeme;
// Child branches created for lexemes this branch accepted; allocated lazily.
private List<TokenBranch> acceptedBranchs;
// Adjacent branch created for lexemes classified as TONEXT; allocated lazily.
private TokenBranch nextBranch;
/**
 * Creates a branch for the given lexeme.
 *
 * @param lexeme the lexeme owned by this branch, or {@code null} for a branch that has
 *               no lexeme of its own (all fields then keep their default values)
 */
TokenBranch(Lexeme lexeme) {
    if (lexeme == null) {
        return;
    }
    this.lexeme = lexeme;
    // The branch initially spans exactly its own lexeme.
    this.leftBorder = lexeme.getBeginPosition();
    this.rightBorder = lexeme.getEndPosition();
}
public int getLeftBorder() {
return leftBorder;
public int getRightBorder() {
return rightBorder;
public Lexeme getLexeme() {
return lexeme;
public List<TokenBranch> getAcceptedBranchs() {
return acceptedBranchs;
public TokenBranch getNextBranch() {
return nextBranch;
public int hashCode(){
if(this.lexeme == null){
return 0;
return this.lexeme.hashCode() * 37;
public boolean equals(Object o){
if(o == null){
return false;
if(this == o){
return true;
if(o instanceof TokenBranch){
TokenBranch other = (TokenBranch)o;
if(this.lexeme == null ||
other.getLexeme() == null){
return false;
return this.lexeme.equals(other.getLexeme());
return false;
* 组合词元分支
* @param _lexeme
* @return 返回当前branch能否接收词元对象
boolean accept(Lexeme _lexeme){
* 检查新的lexeme 对当前的branch 的可接受类型
* acceptType : REFUSED 不能接受
* acceptType : ACCEPTED 接受
* acceptType : TONEXT 由相邻分支接受
int acceptType = checkAccept(_lexeme);
// REFUSE 情况
return false;
if(acceptedBranchs == null){
acceptedBranchs = new ArrayList<TokenBranch>(2);
acceptedBranchs.add(new TokenBranch(_lexeme));
boolean acceptedByChild = false;
for(TokenBranch childBranch : acceptedBranchs){
acceptedByChild = childBranch.accept(_lexeme) || acceptedByChild;
acceptedBranchs.add(new TokenBranch(_lexeme));
if(_lexeme.getEndPosition() > this.rightBorder){
this.rightBorder = _lexeme.getEndPosition();
case TONEXT :
if(this.nextBranch == null){
this.nextBranch = new TokenBranch(null);
return true;
* 将分支数据转成Query逻辑
* @return
List<Query> toQueries(String fieldName){
List<Query> queries = new ArrayList<Query>(1);
//生成当前branch 的query
if(lexeme != null){
queries.add(new TermQuery(new Term(fieldName , lexeme.getLexemeText())));
//生成child branch 的query
if(acceptedBranchs != null && acceptedBranchs.size() > 0){
if(acceptedBranchs.size() == 1){
Query onlyOneQuery = optimizeQueries(acceptedBranchs.get(0).toQueries(fieldName));
if(onlyOneQuery != null){
BooleanQuery orQuery = new BooleanQuery();
for(TokenBranch childBranch : acceptedBranchs){
Query childQuery = optimizeQueries(childBranch.toQueries(fieldName));
if(childQuery != null){
orQuery.add(childQuery, Occur.SHOULD);
if(orQuery.getClauses().length > 0){
if(nextBranch != null){
return queries;
* 判断指定的lexeme能否被当前的branch接受
* @param lexeme
* @return 返回接受的形式
private int checkAccept(Lexeme _lexeme){
int acceptType = 0;
if(_lexeme == null){
throw new IllegalArgumentException("parameter:lexeme is null");
if(null == this.lexeme){//当前的branch是一个不交叠(ROOT)的分支
if(this.rightBorder > 0 //说明当前branch内至少有一个lexeme
&& _lexeme.getBeginPosition() >= this.rightBorder){
//_lexeme 与 当前的branch不相交
acceptType = TONEXT;
acceptType = ACCEPTED;
if(_lexeme.getBeginPosition() < this.lexeme.getBeginPosition()){
//_lexeme 的位置比 this.lexeme还靠前(这种情况不应该发生)
acceptType = REFUSED;
}else if(_lexeme.getBeginPosition() >= this.lexeme.getBeginPosition()
&& _lexeme.getBeginPosition() < this.lexeme.getEndPosition()){
// _lexeme 与 this.lexeme相交
acceptType = REFUSED;
}else if(_lexeme.getBeginPosition() >= this.lexeme.getEndPosition()
&& _lexeme.getBeginPosition() < this.rightBorder){
//_lexeme 与 this.lexeme 不相交, 但_lexeme 与 当前的branch相交
acceptType = ACCEPTED;
}else{//_lexeme.getBeginPosition() >= this.rightBorder
//_lexeme 与 当前的branch不相交
acceptType= TONEXT;
return acceptType;