Package org.apache.any23.extractor.html.HTMLDocument

Examples of org.apache.any23.extractor.html.HTMLDocument.TextField


        conditionallyAddStringProperty(value.source(), rev, vREVIEWAGG.votes,
                value.value());
    }

    private void addSummary(HTMLDocument doc, Resource rev) {
        TextField value = doc.getSingularTextField("summary");
        conditionallyAddStringProperty(value.source(), rev, vREVIEW.title,
                value.value());
    }
View Full Code Here


        return true;
    }

    private void addUid(HTMLDocument compoNode, Resource evt) {
        TextField url = compoNode.getSingularUrlField("uid");
        conditionallyAddStringProperty(
                compoNode.getDocument(),
                evt, vICAL.uid, url.value()
        );
    }
View Full Code Here

                evt, vICAL.uid, url.value()
        );
    }

    private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
        TextField url = compoNode.getSingularUrlField("url");
        if ("".equals(url.value())) return;
        addURIProperty(evt, vICAL.url, getHTMLDocument().resolveURI(url.value()));
    }
View Full Code Here

    private void addRRule(HTMLDocument compoNode, Resource evt) {
        for (Node rule : compoNode.findAllByClassName("rrule")) {
            BNode rrule = valueFactory.createBNode();
            addURIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
            TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
            conditionallyAddStringProperty(
                    freq.source(),
                    rrule, vICAL.freq, freq.value()
            );
            addBNodeProperty(
                    rule,
                    evt, vICAL.rrule, rrule
            );
View Full Code Here

    private void addOrganizer(HTMLDocument compoNode, Resource evt) {
        for (Node organizer : compoNode.findAllByClassName("organizer")) {
            //untyped
            BNode blank = valueFactory.createBNode();
            TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
            conditionallyAddStringProperty(
                    compoNode.getDocument(),
                    blank, vICAL.calAddress, mail.value()
            );
            addBNodeProperty(
                    organizer,
                    evt, vICAL.organizer, blank
            );
View Full Code Here

        final ModifiableConfiguration configuration = DefaultConfiguration.copy();
        configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
        SingleDocumentExtraction instance =  new SingleDocumentExtraction(
                configuration,
                new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
                extractorGroup,
                cth
        );
        instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
        return instance;
View Full Code Here

    private Extractor extractor;
    private TripleHandler mockTripleHandler;

    @Before
    public void setUp() {
        extractor = new TitleExtractor();
        mockTripleHandler = Mockito.mock(TripleHandler.class);
        extractionResult  = new ExtractionResultImpl(
                new ExtractionContext("test-extractor-name", TEST_URI),
                extractor,
                mockTripleHandler
View Full Code Here

        }
        FormatWriter fw = factory.getRdfWriter(byteOutStream);
        fw.setAnnotated(annotate);
        outputMediaType = factory.getMimeType();
        List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>();
        tripleHandlers.add(new IgnoreAccidentalRDFa(fw));
        tripleHandlers.add(new CountingTripleHandler());
        rdfWriter = new CompositeTripleHandler(tripleHandlers);
        reporter = new ReportingTripleHandler(rdfWriter);
        rdfWriter = new IgnoreAccidentalRDFa(
            new IgnoreTitlesOfEmptyDocuments(reporter),
            true    // suppress stylesheet triples.
        );
        return true;
    }
View Full Code Here

        tripleHandlers.add(new IgnoreAccidentalRDFa(fw));
        tripleHandlers.add(new CountingTripleHandler());
        rdfWriter = new CompositeTripleHandler(tripleHandlers);
        reporter = new ReportingTripleHandler(rdfWriter);
        rdfWriter = new IgnoreAccidentalRDFa(
            new IgnoreTitlesOfEmptyDocuments(reporter),
            true    // suppress stylesheet triples.
        );
        return true;
    }
View Full Code Here

                configuration,
                new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
                extractorGroup,
                cth
        );
        instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
        return instance;
    }
View Full Code Here

TOP

Related Classes of org.apache.any23.extractor.html.HTMLDocument.TextField

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle. Contact coftware#gmail.com.