package edu.isi.karma.semantictypes.typinghandler;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queryparser.classic.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.isi.karma.modeling.semantictypes.ISemanticTypeModelHandler;
import edu.isi.karma.modeling.semantictypes.SemanticTypeLabel;
import edu.isi.karma.semantictypes.tfIdf.Indexer;
import edu.isi.karma.semantictypes.tfIdf.Searcher;
import edu.isi.karma.webserver.ServletContextParameterMap;
import edu.isi.karma.webserver.ServletContextParameterMap.ContextParameter;
/**
* This is the API class for the semantic typing module. It implements the
* combined approach by Ramnandan.S.K and Amol Mittal: TF-IDF based cosine
* similarity for textual data and the Kolmogorov-Smirnov test for numeric
* data.
*
* @author ramnandan
*
*/
public class LuceneBasedSTModelHandler implements ISemanticTypeModelHandler {

    static final Logger logger = LoggerFactory
            .getLogger(LuceneBasedSTModelHandler.class.getSimpleName());

    /**
     * File whose presence in the index directory is taken as evidence that an
     * index has been written there.
     *
     * NOTE(review): newer Lucene releases reportedly no longer write this
     * file - confirm the marker matches the Lucene version in use.
     */
    private static final String INDEX_MARKER_FILE = "segments.gen";

    /** Non-alphanumeric characters that survive sanitization, in list order. */
    private static final String ALLOWED_SYMBOLS = " .%@_-*()[]+/&:,;?";

    /** Single-character strings that are kept when sanitizing an example. */
    private final List<String> allowedCharacters;

    /** When false, {@link #predictType(List, int)} refuses to predict. */
    private boolean modelEnabled = false;

    /** Path of the directory holding the index. */
    private String indexDirectory;

    /**
     * Creates a handler whose index directory is read from the
     * {@code SEMTYPE_MODEL_DIRECTORY} context parameter.
     *
     * NOTE: Currently, TF-IDF based approach is used for both textual and
     * numeric data due to bug in KS test on Apache Commons Math.
     *
     * TODO: Integrate KS test when this bug is resolved :
     * https://issues.apache.org/jira/browse/MATH-1131
     */
    public LuceneBasedSTModelHandler() {
        allowedCharacters = allowedCharacters();
        indexDirectory = ServletContextParameterMap
                .getParameterValue(ContextParameter.SEMTYPE_MODEL_DIRECTORY);
    }

    /**
     * Adds the passed list of examples for training.
     *
     * The examples are sanitized first (characters outside the allowed set
     * are dropped, null and empty examples are skipped) and then indexed as a
     * single document for the label.
     *
     * @param label
     *            True label for the list of examples. Must not be null or
     *            blank; it is trimmed before use.
     * @param examples
     *            List of example strings. Must not be null or empty.
     * @return True if the examples were indexed; false if the arguments were
     *         rejected or indexing failed with an I/O error.
     */
    @Override
    public synchronized boolean addType(String label, List<String> examples) {
        // running basic sanity checks in the input arguments
        if (label == null || label.trim().isEmpty()
                || examples == null || examples.isEmpty()) {
            logger.warn("@label argument cannot be null or an empty string and the @examples list cannot be empty.");
            return false;
        }
        String trimmedLabel = label.trim();

        List<String> cleanedExamples = new ArrayList<String>();
        cleanedExamplesList(examples, cleanedExamples);

        // making sure that the condition where the examples list is not empty
        // but contains junk only is not accepted
        if (cleanedExamples.isEmpty()) {
            logger.warn("@examples list contains forbidden characters only. The allowed characters are "
                    + allowedCharacters);
            return false;
        }

        try {
            return indexTrainingColumn(trimmedLabel, cleanedExamples);
        } catch (IOException e) {
            logger.error("Failed to index training examples for label " + trimmedLabel, e);
            return false;
        }
    }

    /**
     * Indexes the given training column for a specific label.
     *
     * If a document for the label can be found in the existing index it is
     * updated, otherwise a new document is added.
     *
     * @param label
     *            label the examples belong to
     * @param selectedExamples
     *            sanitized examples; the whole column is indexed as one
     *            document
     * @return always true when no exception is thrown
     * @throws IOException
     *             propagated from the indexer
     */
    private boolean indexTrainingColumn(String label,
            List<String> selectedExamples) throws IOException {
        // treat content of column as single document
        String columnContent = joinExamples(selectedExamples);

        // document corresponding to existing semantic label, if it exists
        Document labelDoc = null;
        if (indexDirectoryExists()) {
            try {
                Searcher searcher = new Searcher(indexDirectory,
                        Indexer.LABEL_FIELD_NAME);
                try {
                    labelDoc = searcher.getDocumentForLabel(label);
                } finally {
                    searcher.close();
                }
            } catch (Exception e) {
                // Deliberately best-effort: the searcher might not work if the
                // index is empty. The label is then handled as a new one.
                logger.debug("Could not look up an existing document for label "
                        + label, e);
            }
        }

        // index the document
        Indexer indexer = new Indexer(indexDirectory);
        try {
            indexer.open();
            if (labelDoc != null) {
                IndexableField existingContent = labelDoc
                        .getField(Indexer.CONTENT_FIELD_NAME);
                indexer.updateDocument(existingContent, columnContent, label);
            } else {
                indexer.addDocument(columnContent, label);
            }
            indexer.commit();
        } finally {
            indexer.close();
        }
        return true;
    }

    /**
     * Concatenates the examples into one text, each followed by a single
     * space.
     *
     * @param examples
     *            examples to join
     * @return the joined text
     */
    private static String joinExamples(List<String> examples) {
        StringBuilder joined = new StringBuilder();
        for (String example : examples) {
            joined.append(example).append(' ');
        }
        return joined.toString();
    }

    /**
     * Checks whether the index directory contains the index marker file.
     *
     * @return true if the directory can be listed and contains
     *         {@code segments.gen}; false if no directory is configured, the
     *         path is not a listable directory, or the file is absent
     */
    private boolean indexDirectoryExists() {
        if (indexDirectory == null) {
            return false;
        }
        // list() yields null when the path is not a directory or cannot be read
        String[] fileNames = new File(indexDirectory).list();
        if (fileNames == null) {
            return false;
        }
        for (String fileName : fileNames) {
            if (INDEX_MARKER_FILE.equals(fileName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Predicts the most likely semantic types for a column of examples.
     *
     * The examples are joined into one text and searched against the content
     * field of the index.
     *
     * @param examples
     *            list of examples of an unknown type
     * @param numPredictions
     *            required number of predictions; must be positive
     * @return the labels returned by the searcher for the top
     *         {@code numPredictions} matches, or null when the model is not
     *         enabled, the arguments are invalid, no index is present, or the
     *         search fails
     */
    @Override
    public List<SemanticTypeLabel> predictType(List<String> examples,
            int numPredictions) {
        if (!this.modelEnabled) {
            logger.warn("Semantic Type Modeling is not enabled");
            return null;
        }
        // Sanity checks for arguments
        if (examples == null || examples.isEmpty() || numPredictions <= 0) {
            logger.warn("Invalid arguments. Possible problems: examples list size is zero, numPredictions is non-positive");
            return null;
        }
        // log the list itself; an array's toString() only prints its identity
        logger.debug("Predic Type for {}", examples);

        if (!indexDirectoryExists()) {
            return null;
        }

        // construct single text for test column
        String columnContent = joinExamples(examples);
        try {
            Searcher predictor = new Searcher(indexDirectory,
                    Indexer.CONTENT_FIELD_NAME);
            try {
                List<SemanticTypeLabel> result = predictor.getTopK(
                        numPredictions, columnContent);
                logger.debug("Got {} predictions", result.size());
                return result;
            } finally {
                predictor.close();
            }
        } catch (ParseException | IOException e) {
            logger.error("Failed to predict semantic types", e);
        }
        return null;
    }

    /**
     * Removes all labels from the model.
     *
     * Currently, when only TF-IDF is used, equivalent to deleting all
     * documents.
     *
     * @return True if successfully cleared the model. False, otherwise.
     */
    @Override
    public boolean removeAllLabels() {
        try {
            Indexer indexer = new Indexer(indexDirectory);
            try {
                indexer.open();
                indexer.deleteAllDocuments();
                indexer.commit();
            } finally {
                indexer.close();
            }
        } catch (IOException e) {
            logger.error("Failed to remove all labels from the index", e);
            return false;
        }
        return true;
    }

    /**
     * Cleans the examples list passed to it. Generally, it is used by other
     * methods to sanitize lists passed from outside.
     *
     * @param uncleanList
     *            List of all examples
     * @param cleanedList
     *            Output list; it is cleared first and then receives the
     *            sanitized form of every non-null example that is not empty
     *            after sanitization
     */
    private void cleanedExamplesList(List<String> uncleanList,
            List<String> cleanedList) {
        cleanedList.clear();
        for (String example : uncleanList) {
            if (example == null) {
                continue;
            }
            String sanitizedExample = getSanitizedString(example);
            if (!sanitizedExample.isEmpty()) {
                cleanedList.add(sanitizedExample);
            }
        }
    }

    /**
     * Drops every character that is not in the allowed set.
     *
     * @param unsanitizedString
     *            String to be sanitized
     * @return the input with all disallowed characters removed
     */
    private String getSanitizedString(String unsanitizedString) {
        StringBuilder sanitized = new StringBuilder(unsanitizedString.length());
        for (int i = 0; i < unsanitizedString.length(); i++) {
            char current = unsanitizedString.charAt(i);
            if (allowedCharacters.contains(String.valueOf(current))) {
                sanitized.append(current);
            }
        }
        return sanitized.toString();
    }

    /**
     * Builds the allowed character list: A-Z, a-z, 0-9, then the symbols in
     * {@code ALLOWED_SYMBOLS}, in that order.
     *
     * @return list of allowed characters, each as a one-character string
     */
    private static List<String> allowedCharacters() {
        List<String> allowed = new ArrayList<String>();
        for (char c = 'A'; c <= 'Z'; c++) {
            allowed.add(String.valueOf(c));
        }
        for (char c = 'a'; c <= 'z'; c++) {
            allowed.add(String.valueOf(c));
        }
        for (char c = '0'; c <= '9'; c++) {
            allowed.add(String.valueOf(c));
        }
        for (int i = 0; i < ALLOWED_SYMBOLS.length(); i++) {
            allowed.add(String.valueOf(ALLOWED_SYMBOLS.charAt(i)));
        }
        return allowed;
    }

    /**
     * Points this handler at a different index directory. Nothing is read
     * at this point; the path is only stored for later use.
     *
     * @param filepath
     *            path of the index directory to use from now on
     * @return always true
     */
    @Override
    public boolean readModelFromFile(String filepath) {
        indexDirectory = filepath;
        return true;
    }

    /**
     * Enables or disables prediction.
     *
     * @param enabled
     *            true to let {@link #predictType(List, int)} run; false to
     *            make it return null immediately
     */
    @Override
    public void setModelHandlerEnabled(boolean enabled) {
        this.modelEnabled = enabled;
    }
}