TrustManagerManipulator.allowAllSSL();
}
finally {}
}
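// Read the comma-separated list of harvester types to enable from the harvester configuration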
PropertiesManager props = new PropertiesManager();
String sTypes = props.getHarvesterTypes();
if (overrideTypeSettings) { // (override API settings in test mode)
sTypes = "Feed,File,Database,Logstash";
}
String sType[] = sTypes.split("\\s*,\\s*");
// Add a harvester for each data type
for (String s: sType) {
if (s.equalsIgnoreCase("database")) {
try {
this.harvesters.add(new DatabaseHarvester());
}
catch (Exception e) {
logger.error(s + " not supported: " + e.getMessage());
}
catch(NoClassDefFoundError e) {
logger.error(s + " not supported: " + e.getMessage());
}
}
else if (s.equalsIgnoreCase("logstash")) {
try {
this.harvesters.add(new LogstashHarvester());
}
catch (Exception e) {
logger.error(s + " not supported: " + e.getMessage());
}
catch(NoClassDefFoundError e) {
logger.error(s + " not supported: " + e.getMessage());
}
}
else if (s.equalsIgnoreCase("file")) {
// According to http://www.ryanchapin.com/fv-b-4-648/java-lang-OutOfMemoryError--unable-to-create-new-native-thread-Exception-When-Using-SmbFileInputStream.html
// these properties are needed to avoid an intermittent java.lang.OutOfMemoryError ("unable to create new native thread")
// when using SmbFileInputStream (so far observed for only one source, but consistently for that source)
System.setProperty("jcifs.resolveOrder", "DNS");
System.setProperty("jcifs.smb.client.dfs.disabled", "true");
try {
this.harvesters.add(new FileHarvester());
}
catch (Exception e) {
logger.error(s + " not supported: " + e.getMessage());
}
catch(NoClassDefFoundError e) {
logger.error(s + " not supported: " + e.getMessage());
}
}
else if (s.equalsIgnoreCase("feed")) {
try {
this.harvesters.add(new FeedHarvester());
}
catch (Exception e) {
logger.error(s + " not supported: " + e.getMessage());
}
catch(NoClassDefFoundError e) {
logger.error(s + " not supported: " + e.getMessage());
}
}
}
// Load all the extractors, set up defaults
entity_extractor_mappings = new HashMap<String, IEntityExtractor>();
text_extractor_mappings = new HashMap<String, ITextExtractor>();
// Load custom text/entity extractors
synchronized (HarvestController.class) {
if (null == customExtractors) {
customExtractors = new HashMap<String, Class>();
customExtractorClassLoader = HarvestController.class.getClassLoader();
}
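// (the custom extractor classes and their class loader are cached statically, so each class is loaded at most once per JVM)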
// Text extractors:
String customTextList = props.getCustomTextExtractors();
if (null != customTextList) {
String customTextArray[] = customTextList.split("\\s*,\\s*");
for (String customText: customTextArray) {
if (!customExtractors.containsKey(customText)) {
// (else already have this extractor)
try {
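// Load the custom extractor class by its fully qualified name and register an instance under its (lower-case) extractor name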
Class customTextExtractor = customExtractorClassLoader.loadClass(customText);
ITextExtractor obj = (ITextExtractor)customTextExtractor.newInstance();
text_extractor_mappings.put(obj.getName().toLowerCase(), obj);
customExtractors.put(customText, customTextExtractor);
}
catch (Exception e) {
logger.error("ITextExtractor: Couldn't load " + customText +": " + e.getMessage(), e);
}
catch(NoClassDefFoundError e) {
logger.error("ITextExtractor: Couldn't load " + customText +": " + e.getMessage(), e);
}
}
else { // Already loaded, put in again
try {
Class customTextExtractor = customExtractors.get(customText);
ITextExtractor obj = (ITextExtractor)customTextExtractor.newInstance();
text_extractor_mappings.put(obj.getName().toLowerCase(), obj);
}
catch (Exception e) {
logger.error("ITextExtractor: Couldn't use already loaded " + customText +": " + e.getMessage(), e);
}
catch(NoClassDefFoundError e) {
logger.error("ITextExtractor: Couldn't use already loaded " + customText +": " + e.getMessage(), e);
}
}
}
}//TESTED
// Entity extractors
String customEntityList = props.getCustomEntityExtractors();
if (null != customEntityList) {
String customEntityArray[] = customEntityList.split("\\s*,\\s*");
for (String customEntity: customEntityArray) {
if (!customExtractors.containsKey(customEntity)) {
// (class not yet loaded; if it had already been loaded - possibly only as a text extractor - the else branch below still registers it as an entity extractor)
try {
Class customEntityExtractor = customExtractorClassLoader.loadClass(customEntity);
IEntityExtractor obj = (IEntityExtractor)customEntityExtractor.newInstance();
entity_extractor_mappings.put(obj.getName().toLowerCase(), obj);
customExtractors.put(customEntity, customEntityExtractor);
}
catch (Exception e) {
logger.error("IEntityExtractor: Couldn't load " + customEntity +": " + e.getMessage(), e);
}
catch(NoClassDefFoundError e) {
logger.error("IEntityExtractor: Couldn't load " + customEntity +": " + e.getMessage(), e);
}
}
else { // Class already loaded (possibly only registered as a text extractor) - try to register it as an entity extractor as well
try {
Class customEntityExtractor = customExtractors.get(customEntity);
IEntityExtractor obj = (IEntityExtractor)customEntityExtractor.newInstance();
entity_extractor_mappings.put(obj.getName().toLowerCase(), obj); // (lower-case key, to match the default-extractor lookup below)
}
catch (Exception e) {
logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity +": " + e.getMessage(), e);
}
catch(NoClassDefFoundError e) {
logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity +": " + e.getMessage(), e);
}
}
}
}//TESTED
}
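// Register the built-in entity/text extractors - each one is optional, so any failure is just logged as a warning and that extractor is left unavailable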
try {
entity_extractor_mappings.put("opencalais", new ExtractorOpenCalais());
}
catch (Exception e) {
logger.warn("Can't use OpenCalais as entity extractor: " + e.getMessage());
}
try {
entity_extractor_mappings.put("textrank", new TextRankExtractor());
}
catch (Exception e) {
logger.warn("Can't use textrank as entity extractor: " + e.getMessage());
}
try {
ExtractorAlchemyAPI both = new ExtractorAlchemyAPI();
entity_extractor_mappings.put("alchemyapi", both);
text_extractor_mappings.put("alchemyapi", both);
ExtractorAlchemyAPI_Metadata both_metadata = new ExtractorAlchemyAPI_Metadata();
entity_extractor_mappings.put("alchemyapi-metadata", both_metadata);
text_extractor_mappings.put("alchemyapi-metadata", both_metadata);
}
catch (Exception e) {
logger.warn("Can't use AlchemyAPI as entity/text extractor: " + e.getMessage());
}
try {
text_extractor_mappings.put("boilerpipe", new TextExtractorBoilerpipe());
}
catch (Exception e) {
logger.warn("Can't use Boilerpipe as text extractor: " + e.getMessage());
}
try {
text_extractor_mappings.put("tika", new TextExtractorTika());
}
catch (Exception e) {
logger.warn("Can't use Tika as text extractor: " + e.getMessage());
}
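// Pick the default entity/text extractors from the configuration, falling back to Boilerpipe for text if none is specified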
if (null != pm.getDefaultEntityExtractor()) {
default_entity_extractor = entity_extractor_mappings.get(pm.getDefaultEntityExtractor().toLowerCase());
}
else {
default_entity_extractor = null;
}
if (null != pm.getDefaultTextExtractor()) {
default_text_extractor = text_extractor_mappings.get(pm.getDefaultTextExtractor().toLowerCase());
}
else {
try {
default_text_extractor = new TextExtractorBoilerpipe();
}
catch (Exception e) {
logger.warn("Can't use BoilerPlate as default text extractor: " + e.getMessage());
}
}
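// Wait time (ms) between successive feed/web document fetches, from the harvester configuration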
nBetweenFeedDocs_ms = props.getWebCrawlWaitTime();
// Set up security manager - basically always needed so might as well create here
_securityManager = new IkanowSecurityManager();
}