throws RegainException
{
// NOTE(review): method signature is above this chunk. From the body it builds
// and returns a Lucene Document for one crawled resource, reading rawDocument,
// additionalFieldMap, cleanedContent, title, summary, metadata and headlines —
// presumably parameters or fields of the enclosing class; TODO confirm.
String url = rawDocument.getUrl();
// Create a new, empty document
Document doc = new Document();
// Create the auxiliary fields
// NOTE: We do this at first, because if someone defined an auxiliary field
// having the same name as a normal field, then the field will be
// overridden by the normal field. This way we can be sure that the
// normal fields have the value we expect.
AuxiliaryField[] auxiliaryFieldArr = mConfig.getAuxiliaryFieldList();
if (auxiliaryFieldArr != null) {
for (int i = 0; i < auxiliaryFieldArr.length; i++) {
// Only auxiliary fields whose URL regex matches this document's URL apply.
// NOTE(review): RE/match/getParen looks like the Jakarta Regexp API — confirm.
RE regex = auxiliaryFieldArr[i].getUrlRegex();
if (regex.match(url)) {
String fieldName = auxiliaryFieldArr[i].getFieldName();
String value = auxiliaryFieldArr[i].getValue();
if (value == null) {
// We have no value set -> Extract the value from the regex
// (the configured capture group of the URL regex that just matched).
value = regex.getParen(auxiliaryFieldArr[i].getUrlRegexGroup());
}
// value may still be null if the configured group did not participate
// in the match; in that case the field is silently skipped.
if (value != null) {
if (auxiliaryFieldArr[i].getToLowerCase()) {
value = value.toLowerCase();
}
if (mLog.isDebugEnabled()) {
mLog.debug("Adding auxiliary field: " + fieldName + "=" + value);
}
// Store/index/tokenize flags come from the per-field configuration.
boolean store = auxiliaryFieldArr[i].isStored();
boolean index = auxiliaryFieldArr[i].isIndexed();
boolean token = auxiliaryFieldArr[i].isTokenized();
doc.add(new Field(fieldName, value,
store ? Field.Store.YES : Field.Store.NO,
index ? (token ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO));
}
}
}
}
// Add the groups of the document
// (only when an access controller is configured; otherwise no "groups" field
// is added at all).
if (mCrawlerAccessController != null) {
String[] groupArr = mCrawlerAccessController.getDocumentGroups(rawDocument);
// Check the Group array
RegainToolkit.checkGroupArray(mCrawlerAccessController, groupArr);
// Add the field
// NOTE: The field "groups" is tokenized, but not stemmed.
// See: RegainToolkit.WrapperAnalyzer
// The group names are joined with single spaces (plus one trailing space)
// and fed through a WhitespaceTokenizer, so each group becomes one token.
Iterator groupIter = Arrays.asList(groupArr).iterator();
StringBuilder tokenBuilder = new StringBuilder();
while (groupIter.hasNext()) {
tokenBuilder.append((String) groupIter.next());
tokenBuilder.append(" ");
}
//doc.add(new Field("groups", new IteratorTokenStream(groupIter)));
// NOTE(review): Field(String, TokenStream) creates an indexed, unstored
// field — presumably a pre-4.x Lucene API; confirm against the Lucene
// version this project builds with.
doc.add(new Field("groups", new WhitespaceTokenizer(new StringReader(tokenBuilder.toString()))));
}
// Add the URL of the document
// (stored and un-analyzed, so it can be displayed and matched exactly)
doc.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
// Add the file name (without protocol, drive-letter and path)
// urlToWhitespacedFileName produces whitespace-separated filename variants
// which are then split into individual tokens by the WhitespaceTokenizer.
String filenameWithVariants = RegainToolkit.urlToWhitespacedFileName(url);
doc.add(new Field("filename", new WhitespaceTokenizer(new StringReader(filenameWithVariants))));
// Split the URL into its path and filename parts (pfPair is reused below
// for the "path" and "path_sort" fields).
PathFilenamePair pfPair = RegainToolkit.fragmentUrl(url);
// Add the filename field for sorting
doc.add(new Field("filename_sort", pfPair.getFilename(), Field.Store.YES, Field.Index.NOT_ANALYZED));
// Add the document's size
// NOTE(review): the size is indexed as a plain decimal string, so any
// sorting/range queries on "size" are lexicographic, not numeric — verify
// that is intended on the search side.
int size = rawDocument.getLength();
doc.add(new Field("size", Integer.toString(size), Field.Store.YES, Field.Index.NOT_ANALYZED));
// Add the mime-type
doc.add(new Field("mimetype", mimeType, Field.Store.YES, Field.Index.NOT_ANALYZED));
// Add last modified
Date lastModified = rawDocument.getLastModified();
if (lastModified == null) {
// We don't know when the document was last modified
// -> Take the current time
lastModified = new Date();
}
// Day resolution: the index only distinguishes last-modified dates, not times.
doc.add(new Field("last-modified",
DateTools.dateToString(lastModified, DateTools.Resolution.DAY), Field.Store.YES,
Field.Index.NOT_ANALYZED));
// Write the raw content to an analysis file
writeContentAnalysisFile(rawDocument);
// Add the additional fields
// Each additional field is added twice: once analyzed-but-unstored for
// searching, and once as a compressed stored byte[] for retrieval
// (replacing the deprecated Field.Store.COMPRESS shown in the old line).
if (additionalFieldMap != null) {
Iterator iter = additionalFieldMap.keySet().iterator();
while (iter.hasNext()) {
String fieldName = (String) iter.next();
String fieldValue = (String) additionalFieldMap.get(fieldName);
//doc.add(new Field(fieldName, fieldValue, Field.Store.COMPRESS, Field.Index.ANALYZED));
doc.add(new Field(fieldName, fieldValue, Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field(fieldName, CompressionTools.compressString(fieldValue), Field.Store.YES));
}
}
if (hasContent(cleanedContent)) {
// Write the clean content to an analysis file
writeAnalysisFile(url, "clean", cleanedContent);
// Add the cleaned content of the document
// (stored only when preview is enabled; always analyzed for search)
doc.add(new Field("content", cleanedContent,
this.storeContentForPreview ? Field.Store.YES : Field.Store.NO, Field.Index.ANALYZED));
} else {
// We have no content! This is a substitute document
// -> Add a "preparation-error"-field
// (stored but not indexed: a marker readable from the stored document)
doc.add(new Field("preparation-error", "true", Field.Store.YES,
Field.Index.NO));
}
// Check whether to use the link text as title
// The first URL regex that matches wins (note the break); if the matching
// document has no source link text, the existing title is kept.
for (int i = 0; i < mUseLinkTextAsTitleReArr.length; i++) {
if (mUseLinkTextAsTitleReArr[i].match(url)) {
String linkText = rawDocument.getSourceLinkText();
if (linkText != null) {
title = linkText;
}
break;
}
}
// Add the document's title
// "title_sort" is always added (empty when there is no title) so that
// sorting by title never encounters documents missing the field; it is
// lowercased for case-insensitive ordering.
if (hasContent(title)) {
doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("title_sort", title.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));
} else {
doc.add(new Field("title_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
}
// Add the document's summary
// If no summary was extracted, derive one from the cleaned content.
if (! hasContent(summary) && hasContent(cleanedContent)) {
summary = createSummaryFromContent(cleanedContent);
}
if (hasContent(summary)) {
// Same two-field pattern as the additional fields: analyzed-unstored for
// search plus a compressed stored copy for display.
doc.add(new Field("summary", summary, Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("summary", CompressionTools.compressString(summary), Field.Store.YES));
}
// Add the document's metadata
if (hasContent(metadata)) {
doc.add(new Field("metadata", metadata, Field.Store.YES, Field.Index.ANALYZED));
}
// Add the document's headlines
if (hasContent(headlines)) {
doc.add(new Field("headlines", headlines, Field.Store.NO,
Field.Index.ANALYZED));
}
// Add the document's path
// Like "title_sort", "path_sort" is always added (empty when the URL has
// no path) so path-sorted result lists stay consistent.
if (pfPair.getPath() != null) {
//String asString = pathToString(path);
doc.add(new Field("path", pfPair.getPath(), Field.Store.YES, Field.Index.NO));
doc.add(new Field("path_sort", pfPair.getPath().toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));
// Write the path to an analysis file
writeAnalysisFile(url, "path", pfPair.getPath());
} else {
doc.add(new Field("path_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
}
return doc;
}