Package org.apache.any23.extractor.html.HTMLDocument

Examples of org.apache.any23.extractor.html.HTMLDocument.TextField


    throws IOException, ExtractionException, TripleHandlerException {
        final URI uri = RDFUtils.uri("http://host.com/test-malformed-literal.turtle");
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        final TripleHandler th = new RDFXMLWriter(baos);
        final ExtractionContext extractionContext = new ExtractionContext("turtle-extractor", uri);
        final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, th);
        extractor.setStopAtFirstError(false);
        try {
            extractor.run(
                    ExtractionParameters.newDefault(),
                    extractionContext,
                    this.getClass().getResourceAsStream("/org/apache/any23/extractor/rdf/testMalformedLiteral"),
                    result
            );
        } finally {
            logger.debug(baos.toString());
            th.close();
            result.close();
        }
    }
View Full Code Here


     * @throws InstantiationException
     * @throws IllegalAccessException
     */
    @Test
    public void testDetectExtractorPlugins() throws IOException, InstantiationException, IllegalAccessException {
        final ExtractorGroup extractorGroup = manager.getApplicableExtractors(
                new ExtractorRegistryImpl(),
                HTML_SCRAPER_TARGET_DIR,  // Required to satisfy class dependencies.
                HTML_SCRAPER_DEPENDENCY_DIR,
                OFFICE_SCRAPER_TARGET_DIR
, OFFICE_SCRAPER_DEPENDENCY_DIR // Required to satisfy class dependencies.
        );
        assertEquals("Did not find the number of expected extractors", NUM_OF_EXTRACTORS ,        // HTMLScraper Plugin, OfficeScraper Plugin.
                extractorGroup.getNumOfExtractors()
        );
    }
View Full Code Here

     * @throws IllegalAccessException
     */
    @Test
    public void testDetectExtractorPlugins() throws IOException, InstantiationException, IllegalAccessException {
        final ExtractorGroup extractorGroup = manager.getApplicableExtractors(
                new ExtractorRegistryImpl(),
                HTML_SCRAPER_TARGET_DIR,  // Required to satisfy class dependencies.
                HTML_SCRAPER_DEPENDENCY_DIR,
                OFFICE_SCRAPER_TARGET_DIR
, OFFICE_SCRAPER_DEPENDENCY_DIR // Required to satisfy class dependencies.
        );
View Full Code Here

     * @throws IOException
     */
    // TODO: MimeType detector to null forces the execution of all extractors, but extraction
    //       tests should be based on mimetype detection.
    protected void extract(String resource) throws ExtractionException, IOException {
        SingleDocumentExtraction ex = new SingleDocumentExtraction(
            new HTMLFixture(copyResourceToTempFile(resource)).getOpener(baseURI.toString()),
            getExtractorFactory(), new RepositoryWriter(conn)
        );
        ex.setMIMETypeDetector(null);
        report = ex.run();
    }
View Full Code Here

        return true;
    }

    private void addUid(HTMLDocument compoNode, Resource evt) {
        TextField url = compoNode.getSingularUrlField("uid");
        conditionallyAddStringProperty(
                compoNode.getDocument(),
                evt, vICAL.uid, url.value()
        );
    }
View Full Code Here

                evt, vICAL.uid, url.value()
        );
    }

    private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
        TextField url = compoNode.getSingularUrlField("url");
        if ("".equals(url.value())) return;
        addURIProperty(evt, vICAL.url, getHTMLDocument().resolveURI(url.value()));
    }
View Full Code Here

    private void addRRule(HTMLDocument compoNode, Resource evt) {
        for (Node rule : compoNode.findAllByClassName("rrule")) {
            BNode rrule = valueFactory.createBNode();
            addURIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
            TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
            conditionallyAddStringProperty(
                    freq.source(),
                    rrule, vICAL.freq, freq.value()
            );
            addBNodeProperty(
                    rule,
                    evt, vICAL.rrule, rrule
            );
View Full Code Here

    private void addOrganizer(HTMLDocument compoNode, Resource evt) {
        for (Node organizer : compoNode.findAllByClassName("organizer")) {
            //untyped
            BNode blank = valueFactory.createBNode();
            TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
            conditionallyAddStringProperty(
                    compoNode.getDocument(),
                    blank, vICAL.calAddress, mail.value()
            );
            addBNodeProperty(
                    organizer,
                    evt, vICAL.organizer, blank
            );
View Full Code Here

            // we have to remove the field soon to avoid infinite loops
            // no null check, we know it's there or we won't be in the loop
            current.getAttributes().removeNamedItem("class");
            ArrayList<TextField> res = new ArrayList<TextField>();
            HTMLDocument.readUrlField(res, current);
            TextField id = res.get(0);
            if (null == id)
                continue;
            TextField refId = new TextField( StringUtils.substringAfter(id.value(), "#"), id.source() );
            Node included = document.findNodeById(refId.value());
            if (null == included)
                continue;
            if( DomUtils.isAncestorOf(included, current) )  {
                final int[] nodeLocation = DomUtils.getNodeLocation(current);
                report.notifyIssue(
View Full Code Here

        }
        return found;
    }

    private boolean addUid(Resource card) {
        TextField uid = fragment.getSingularUrlField("uid");
        return conditionallyAddStringProperty(
                fragment.getDocument(),
                card, vCARD.uid, uid.value()
        );
    }
View Full Code Here

TOP

Related Classes of org.apache.any23.extractor.html.HTMLDocument.TextField

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.