Package org.wikiclean

Examples of org.wikiclean.WikiClean


    long startTime = System.currentTimeMillis();

    String path = cmdline.getOptionValue(INPUT_OPTION);
    PrintStream out = new PrintStream(System.out, true, "UTF-8");
    WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build();

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    LOG.info("Creating index at " + indexPath);
    LOG.info("Indexing with " + threads + " threads");

    try {
      WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path);

      ExecutorService executor = Executors.newFixedThreadPool(threads);
      int cnt = 0;
      String page;
      while ((page = stream.readNext()) != null) {
        String title = cleaner.getTitle(page);

        // These are heuristics specifically for filtering out non-articles in enwiki-20120104.
        if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) {
          continue;
        }
View Full Code Here

TOP

Related Classes of org.wikiclean.WikiClean

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.