/*
* Copyright 2010 Outerthought bvba
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lilyproject.indexer.engine;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import com.google.common.collect.Lists;
import com.google.common.primitives.Ints;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.lilyproject.indexer.model.indexerconf.DerefValue;
import org.lilyproject.indexer.model.indexerconf.FieldValue;
import org.lilyproject.indexer.model.indexerconf.Follow;
import org.lilyproject.indexer.model.indexerconf.FollowCallback;
import org.lilyproject.indexer.model.indexerconf.Formatter;
import org.lilyproject.indexer.model.indexerconf.IndexUpdateBuilder;
import org.lilyproject.indexer.model.indexerconf.IndexValue;
import org.lilyproject.indexer.model.indexerconf.LilyIndexerConf;
import org.lilyproject.indexer.model.indexerconf.Value;
import org.lilyproject.repository.api.Blob;
import org.lilyproject.repository.api.FieldType;
import org.lilyproject.repository.api.LRepository;
import org.lilyproject.repository.api.Record;
import org.lilyproject.repository.api.RepositoryException;
import org.lilyproject.util.io.Closer;
import org.lilyproject.util.repo.SystemFields;
/**
 * Evaluates an index field value (a {@link Value}) to a value.
 *
 * <p>The outcome of an evaluation is a list of strings, produced either by
 * running the field values through a {@link Formatter} or, when content
 * extraction is requested, by extracting the text of blob values with Tika.</p>
 */
public class ValueEvaluator {
    /** Write limit handed to Tika's content handler per blob: 500K (Tika default: 100K). */
    // TODO make write limit configurable
    private static final int EXTRACT_WRITE_LIMIT = 500 * 1000;

    private final Log log = LogFactory.getLog(getClass());
    private final LilyIndexerConf conf;
    private final SystemFields systemFields;
    private final Parser tikaParser = new AutoDetectParser();

    /**
     * @param conf the indexer configuration; supplies the formatters and the
     *             system fields used during evaluation
     */
    public ValueEvaluator(LilyIndexerConf conf) {
        this.conf = conf;
        this.systemFields = conf.getSystemFields();
    }

    /**
     * Evaluates a value against the record held by the given index update
     * builder.
     *
     * @param table name of the repository table blobs are read from when
     *              content extraction is enabled for the value
     * @param valueDef the value definition to evaluate
     * @param indexUpdateBuilder provides the record context and the repository
     * @return null if there is no value
     */
    public List<String> eval(String table, Value valueDef, IndexUpdateBuilder indexUpdateBuilder)
            throws RepositoryException, IOException, InterruptedException {
        List<IndexValue> indexValues = evalValue(valueDef, indexUpdateBuilder);
        if (indexValues == null || indexValues.isEmpty()) {
            return null;
        }

        LRepository repository = indexUpdateBuilder.getRepository();
        if (valueDef.extractContent()) {
            return extractContent(table, indexValues, repository);
        }

        Formatter formatter = conf.getFormatters().getFormatter(valueDef.getFormatter());
        return formatter.format(indexValues, repository);
    }

    /**
     * Direct 'evaluation' (content extraction, formatting) of a given field
     * from a record. Should only be called if the field is present in the
     * record.
     *
     * @param table name of the repository table blobs are read from
     * @param record the record holding the field
     * @param fieldType type of the field to evaluate
     * @param extractContent whether blob content should be extracted; only
     *                       honoured when the deepest value type is BLOB
     * @param formatterName name of the formatter used when no content
     *                      extraction takes place
     * @param repository the repository passed on to extraction/formatting
     */
    public List<String> format(String table, Record record, FieldType fieldType, boolean extractContent,
            String formatterName, LRepository repository) throws InterruptedException {
        Object value = record.getField(fieldType.getName());

        List<IndexValue> indexValues;
        if (fieldType.getValueType().getBaseName().equals("LIST")) {
            // Top-level lists are split up into one IndexValue per list entry.
            List<?> values = (List<?>) value;
            indexValues = new ArrayList<IndexValue>(values.size());
            for (int i = 0; i < values.size(); i++) {
                indexValues.add(new IndexValue(record, fieldType, i, values.get(i)));
            }
        } else {
            indexValues = Collections.singletonList(new IndexValue(record, fieldType, value));
        }

        if (fieldType.getValueType().getDeepestValueType().getBaseName().equals("BLOB") && extractContent) {
            return extractContent(table, indexValues, repository);
        }

        Formatter formatter = conf.getFormatters().getFormatter(formatterName);
        return formatter.format(indexValues, repository);
    }

    /**
     * Extracts the textual content of all blobs contained in the given index
     * values.
     *
     * @return the extracted texts, or null if nothing was extracted
     */
    private List<String> extractContent(String table, List<IndexValue> indexValues, LRepository repository) {
        // At this point we can be sure the value will be a blob, this is
        // validated during the construction of the indexer conf.
        List<String> result = new ArrayList<String>(indexValues.size());
        Deque<Integer> indexes = new ArrayDeque<Integer>();

        for (IndexValue indexValue : indexValues) {
            // Each index value starts a fresh index path, optionally rooted at its list position.
            indexes.clear();
            if (indexValue.listIndex != null) {
                indexes.addLast(indexValue.listIndex);
            }
            extractContent(table, indexValue.value, indexes, indexValue.record, indexValue.fieldType, result,
                    repository);
        }

        return result.isEmpty() ? null : result;
    }

    /**
     * Recursively descends into nested list values, tracking the position of
     * each element in {@code indexes}, and extracts the content of every
     * non-list value encountered.
     */
    private void extractContent(String table, Object value, Deque<Integer> indexes, Record record,
            FieldType fieldType, List<String> result, LRepository repository) {
        if (value instanceof List) { // this covers both LIST and PATH types
            List<?> values = (List<?>) value;
            for (int i = 0; i < values.size(); i++) {
                indexes.addLast(i);
                extractContent(table, values.get(i), indexes, record, fieldType, result, repository);
                indexes.removeLast();
            }
        } else {
            extractContent(table, value, record, fieldType, Ints.toArray(indexes), result, repository);
        }
    }

    /**
     * Extracts the text of a single blob using Tika and, if non-empty, appends
     * it to {@code result}.
     *
     * <p>Extraction is best-effort: failures are logged and do not propagate.
     * When Tika's write limit is reached, the partial text gathered so far is
     * still used.</p>
     *
     * @param value the blob value (cast to {@link Blob})
     * @param indexes position of the blob within (nested) list values, passed
     *                on when opening the blob's input stream
     */
    private void extractContent(String table, Object value, Record record, FieldType fieldType, int[] indexes,
            List<String> result, LRepository repository) {
        Blob blob = (Blob) value;
        InputStream is = null;

        WriteOutContentHandler woh = new WriteOutContentHandler(EXTRACT_WRITE_LIMIT);
        BodyContentHandler ch = new BodyContentHandler(woh);

        try {
            is = repository.getTable(table).getInputStream(record, fieldType.getName(), indexes);

            Metadata metadata = new Metadata();
            metadata.add(Metadata.CONTENT_TYPE, blob.getMediaType());
            if (blob.getName() != null) {
                metadata.add(Metadata.RESOURCE_NAME_KEY, blob.getName());
            }

            ParseContext parseContext = new ParseContext();
            tikaParser.parse(is, ch, metadata, parseContext);
        } catch (Throwable t) {
            if (t instanceof InterruptedException) {
                // The broad catch would otherwise silently consume the interruption:
                // restore the flag so code further up the stack can still react to it.
                Thread.currentThread().interrupt();
            }

            if (woh.isWriteLimitReached(t)) {
                // ok, we'll just use the partial result
                if (log.isInfoEnabled()) {
                    log.info("Blob extraction: write limit reached. Field '" + fieldType.getName() + "', record '"
                            + record.getId() + "'.");
                }
            } else {
                log.error(
                        "Error extracting blob content. Field '" + fieldType.getName() + "', record '"
                                + record.getId() + "'.", t);
                return;
            }
        } finally {
            Closer.close(is);
        }

        String text = ch.toString();
        if (text.length() > 0) {
            result.add(text);
        }
    }

    /**
     * Dispatches on the kind of value definition: a {@link FieldValue} is read
     * from the current record, a {@link DerefValue} is resolved by following
     * its chain of follows.
     *
     * @return the index values; null when a field value is absent, possibly
     *         an empty list for a deref value
     * @throws RuntimeException for any other kind of value definition
     */
    private List<IndexValue> evalValue(Value value, IndexUpdateBuilder indexUpdateBuilder)
            throws RepositoryException, IOException, InterruptedException {
        if (value instanceof FieldValue) {
            return getValue(indexUpdateBuilder, ((FieldValue) value).getTargetFieldType(), null);
        } else if (value instanceof DerefValue) {
            List<IndexValue> result = Lists.newArrayList();
            evalDerefValue((DerefValue) value, indexUpdateBuilder, result);
            return result;
        } else {
            throw new RuntimeException("Unexpected type of value: " + value.getClass().getName());
        }
    }

    /**
     * Reads the given field from the record in the builder's current record
     * context and wraps it in index values (one per entry for LIST fields).
     *
     * @param indexValues
     *            optional, if supplied values will be added to this list,
     *            otherwise a new list will be created and returned
     * @return null if there's no value
     */
    private List<IndexValue> getValue(IndexUpdateBuilder indexUpdateBuilder, FieldType fieldType,
            List<IndexValue> indexValues) throws RepositoryException, InterruptedException {
        Record record = indexUpdateBuilder.getRecordContext().record;
        Object value = getValue(indexUpdateBuilder, fieldType);

        if (value == null) {
            return null;
        }

        if (fieldType.getValueType().getBaseName().equals("LIST")) {
            List<?> values = (List<?>) value;
            List<IndexValue> result = indexValues != null ? indexValues : new ArrayList<IndexValue>(values.size());
            for (int i = 0; i < values.size(); i++) {
                result.add(new IndexValue(record, fieldType, i, values.get(i)));
            }
            return result;
        }

        if (indexValues != null) {
            indexValues.add(new IndexValue(record, fieldType, value));
            return indexValues;
        }
        return Collections.singletonList(new IndexValue(record, fieldType, value));
    }

    /**
     * Evaluates a deref value starting at its first follow, adding the values
     * found to {@code values}.
     */
    private void evalDerefValue(DerefValue deref, IndexUpdateBuilder indexUpdateBuilder, List<IndexValue> values)
            throws RepositoryException, IOException, InterruptedException {
        evalDerefValue(deref, 0, indexUpdateBuilder, values);
    }

    /**
     * Evaluates the follow at position {@code fieldNum} of the deref value and
     * recurses into the next follow from within the follow's callback. Once
     * all follows have been processed, the target field of the then-current
     * record context is read and its values are appended to {@code values}.
     *
     * <p>This method does not return anything: when there are no results,
     * {@code values} is simply left untouched.</p>
     *
     * @param deref the deref value being evaluated
     * @param fieldNum index of the follow to evaluate next
     * @param indexUpdateBuilder provides the record context the follows and
     *                           the final field lookup operate on
     * @param values list to which the resulting index values are added
     */
    public void evalDerefValue(final DerefValue deref, final int fieldNum,
            final IndexUpdateBuilder indexUpdateBuilder, final List<IndexValue> values)
            throws RepositoryException, IOException, InterruptedException {
        if (fieldNum >= deref.getFollows().size()) {
            // End of the follow chain: collect the target field's value(s).
            getValue(indexUpdateBuilder, deref.getTargetFieldType(), values);
            return;
        }

        Follow follow = deref.getFollows().get(fieldNum);
        follow.follow(indexUpdateBuilder, new FollowCallback() {
            @Override
            public void call() throws RepositoryException, IOException, InterruptedException {
                evalDerefValue(deref, fieldNum + 1, indexUpdateBuilder, values);
            }
        });
    }

    /**
     * Looks up the raw value of a field in the builder's current record.
     * System fields are evaluated through {@link SystemFields}; for all other
     * fields a dependency on the field is registered with the builder (even
     * when the record is absent or lacks the field) before the value is read.
     *
     * @return the field value, or null if there is no record or (for
     *         non-system fields) the record does not have the field
     */
    private Object getValue(IndexUpdateBuilder indexUpdateBuilder, FieldType fieldType) throws RepositoryException,
            InterruptedException {
        Object value = null;
        Record record = indexUpdateBuilder.getRecordContext().record;

        if (systemFields.isSystemField(fieldType.getName())) {
            if (record != null) {
                value = systemFields.eval(record, fieldType, indexUpdateBuilder.getRepository().getTypeManager());
            }
        } else {
            indexUpdateBuilder.addDependency(fieldType.getId());
            if (record != null && record.hasField(fieldType.getName())) {
                value = record.getField(fieldType.getName());
            }
        }

        return value;
    }
}