Package org.apache.any23.configuration

Examples of org.apache.any23.configuration.ModifiableConfiguration


     * @throws InstantiationException
     * @throws IllegalAccessException
     */
    @Test
    public void testDetectExtractorPlugins() throws IOException, InstantiationException, IllegalAccessException {
        final ExtractorGroup extractorGroup = manager.getApplicableExtractors(
                new ExtractorRegistryImpl(),
                HTML_SCRAPER_TARGET_DIR,  // Required to satisfy class dependencies.
                HTML_SCRAPER_DEPENDENCY_DIR,
                OFFICE_SCRAPER_TARGET_DIR
, OFFICE_SCRAPER_DEPENDENCY_DIR // Required to satisfy class dependencies.
        );
        assertEquals("Did not find the number of expected extractors", NUM_OF_EXTRACTORS ,        // HTMLScraper Plugin, OfficeScraper Plugin.
                extractorGroup.getNumOfExtractors()
        );
    }
View Full Code Here


            //for (ExtractorFactory<?> extractorFactory : initialExtractorGroup) {
            //    newFactoryList.add(extractorFactory);
            //}

            return new ExtractorGroup(newFactoryList);
        } finally {
            logger.info(report.toString());
        }
    }
View Full Code Here

     * @throws IllegalAccessException
     */
    @Test
    public void testDetectExtractorPlugins() throws IOException, InstantiationException, IllegalAccessException {
        final ExtractorGroup extractorGroup = manager.getApplicableExtractors(
                new ExtractorRegistryImpl(),
                HTML_SCRAPER_TARGET_DIR,  // Required to satisfy class dependencies.
                HTML_SCRAPER_DEPENDENCY_DIR,
                OFFICE_SCRAPER_TARGET_DIR
, OFFICE_SCRAPER_DEPENDENCY_DIR // Required to satisfy class dependencies.
        );
View Full Code Here

     * @throws IOException
     */
    // TODO: Setting the MIME type detector to null forces the execution of all extractors, but
    //       extraction tests should be based on MIME type detection.
    protected void extract(String resource) throws ExtractionException, IOException {
        SingleDocumentExtraction ex = new SingleDocumentExtraction(
            new HTMLFixture(copyResourceToTempFile(resource)).getOpener(baseURI.toString()),
            getExtractorFactory(), new RepositoryWriter(conn)
        );
        ex.setMIMETypeDetector(null);
        report = ex.run();
    }
View Full Code Here

        final ModifiableConfiguration configuration = DefaultConfiguration.copy();
        configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
        SingleDocumentExtraction instance =  new SingleDocumentExtraction(
                configuration,
                new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
                extractorGroup,
                cth
        );
        instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
        return instance;
View Full Code Here

    private Extractor extractor;
    private TripleHandler mockTripleHandler;

    @Before
    public void setUp() {
        extractor = new TitleExtractor();
        mockTripleHandler = Mockito.mock(TripleHandler.class);
        extractionResult  = new ExtractionResultImpl(
                new ExtractionContext("test-extractor-name", TEST_URI),
                extractor,
                mockTripleHandler
View Full Code Here

        }
        FormatWriter fw = factory.getRdfWriter(byteOutStream);
        fw.setAnnotated(annotate);
        outputMediaType = factory.getMimeType();
        List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>();
        tripleHandlers.add(new IgnoreAccidentalRDFa(fw));
        tripleHandlers.add(new CountingTripleHandler());
        rdfWriter = new CompositeTripleHandler(tripleHandlers);
        reporter = new ReportingTripleHandler(rdfWriter);
        rdfWriter = new IgnoreAccidentalRDFa(
            new IgnoreTitlesOfEmptyDocuments(reporter),
            true    // suppress stylesheet triples.
        );
        return true;
    }
View Full Code Here

                return 2000;
            }
        });
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        TripleHandler handler = new RDFXMLWriter(byteArrayOutputStream);
        TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
        ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);

        DocumentSource source = getDocumentSourceFromResource(
                    "/html/rdfa/ansa_2010-02-26_12645863.html",
                    "http://host.com/service");
View Full Code Here

     */
    private ExtractionReport detectAndExtract(String in) throws Exception {
        Any23 any23 = new Any23();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        ReportingTripleHandler outputHandler = new ReportingTripleHandler(
                new IgnoreAccidentalRDFa(
                        new IgnoreTitlesOfEmptyDocuments(
                                new NTriplesWriter(out)
                        )
                )
        );
View Full Code Here

        tripleHandlers.add(new IgnoreAccidentalRDFa(fw));
        tripleHandlers.add(new CountingTripleHandler());
        rdfWriter = new CompositeTripleHandler(tripleHandlers);
        reporter = new ReportingTripleHandler(rdfWriter);
        rdfWriter = new IgnoreAccidentalRDFa(
            new IgnoreTitlesOfEmptyDocuments(reporter),
            true    // suppress stylesheet triples.
        );
        return true;
    }
View Full Code Here

TOP

Related Classes of org.apache.any23.configuration.ModifiableConfiguration

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.