if (isUsingDefaultFieldExtractor()) {
extractedFields.addAll(defaultFieldExtractions());
}
if (this.classToGet != null) {
Source source = new Source(url);
source.fullSequentialParse();
List<Element> elements = source.getAllElementsByClass(classToGet);
String text = elements.get(0).toString();
String[] fields = text.split("<br>");
log.debug("fields: {}", fields);
for (String field : fields) {
Source fieldSource = new Source(field);
field = fieldSource.getTextExtractor().toString();
String[] fieldParts = field.split(":");
log.debug("{} : {}", fieldParts[0], fieldParts[1]);
extractedFields.add(new ScrapedField(fieldParts[0], fieldParts[1]));
}
}
Source source = new Source(url);
for (TagOccurrence tagOccurrence : this.tagsToGet) {
// log.debug("extracting fields using tag: {}", tagOccurrence);
source.fullSequentialParse();
if (!(tagOccurrence.getTag().contains(HTMLElementName.TABLE) || tagOccurrence.getTag().contains(
HTMLElementName.A))) {
throw new IllegalStateException(MessageFormat.format(
"Asked to extract tag: {0}, only know how to extract fields from tables.",
tagOccurrence.getTag()));
} else {
if (isAttemptingToMatchSpecificTable(tagOccurrence)) {
source = new Source(extractTagText(source.toString(), tagOccurrence));
extractedFields.addAll(extractFieldsFromTable(source.toString()));
} else if (tagOccurrence.getTag().equals(HTMLElementName.TABLE)) {
extractedFields.addAll(extractFieldsFromTable(source.toString()));
} else {
extractedFields = extractLinksFromList(source.toString());
}
}
}
source = new Source(url);
source.fullSequentialParse();
if (this.afterTagOccurrence != null) {
source = pruneFrom(source, afterTagOccurrence);
}
for (FieldToGet fieldToGet : fieldsToGet) {
String value = "";
if (fieldToGet.getSearchType() == FieldSearchType.Tag) {
value = source.getAllElements(fieldToGet.getLabel()).get(0).getTextExtractor().toString();
}
extractedFields.add(new ScrapedField(fieldToGet.getFieldname(), value));
}
for (PairedTags tagPair : this.fieldPairs) {
List<Element> labels = source.getAllElements(tagPair.getLabelTag());
List<Element> fields = source.getAllElements(tagPair.getFieldTag());
removeInvalidFields(fields);
int fieldCount = Math.min(labels.size(), fields.size());
for (int i = 0; i < fieldCount; i++) {