Examples of SimpleParser


Examples of bixo.parser.SimpleParser

        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);
       
        // Take content and split it into content output plus parse to extract URLs.
        SimpleParser parser;
        if (options.isUseBoilerpipe()) {
            parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
        } else if (options.isGenerateHTML()) {
            parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
        } else {
            parser = new SimpleParser();
        }
       
        parser.setExtractLanguage(false);
        ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

       
        // Create the output map that connects each tail pipe to the appropriate sink, and the
        // list of tail pipes.
View Full Code Here

Examples of bixo.parser.SimpleParser

        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(1, policy, new FirefoxUserAgent());
        fetcher.setMaxRetryCount(options.getMaxRetries());
       
        // Give a long timeout for parsing
        ParserPolicy parserPolicy = new ParserPolicy(MAX_PARSE_DURATION);
        SimpleParser parser = new SimpleParser(parserPolicy);

        SimpleParser rawParser = new SimpleParser(parserPolicy, true);
       
        // Create Boilerpipe content extractor
        SimpleParser bpParser = new SimpleParser(new BoilerpipeContentExtractor(), new NullLinkExtractor(), parserPolicy);
       
        if (options.isTraceLogging()) {
            Logger.getRootLogger().setLevel(Level.TRACE);
            System.setProperty("bixo.root.level", "TRACE");
        }
       
        String urls[] = options.getUrls() == null ? null : options.getUrls().split(",");
        boolean interactive = (urls == null);
        int index = 0;
       
        while (interactive || (index < urls.length)) {
          String url;
         
          try {
              if (interactive) {
                System.out.print("URL to fetch: ");
                url = readInputLine();
                if (url.length() == 0) {
                  System.exit(0);
                }
              } else {
                url = args[index++];
              }

              System.out.println("Fetching " + url);
            FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
            System.out.println(String.format("Fetched %s: headers = %s", result.getUrl(), result.getHeaders()));
            System.out.flush();
           
            // System.out.println("Result = " + result.toString());
            ParsedDatum parsed = parser.parse(result);
            System.out.println(String.format("Parsed %s: lang = %s, size = %d", parsed.getUrl(),
                            parsed.getLanguage(), parsed.getParsedText().length()));
           
            ParsedDatum bpParsed = bpParser.parse(result);
            ParsedDatum rawParsed = rawParser.parse(result);
           
            if (interactive) {
                while (true) {
                    System.out.print("Next action - (d)ump regular, dump (b)oilerpipe, dump (r)aw, (e)xit: ");
View Full Code Here

Examples of bixo.parser.SimpleParser

        BixoPlatform platform = new BixoPlatform(ParsePipeTest.class, Platform.Local);
       

        Pipe pipe = new Pipe("parse_source");
        ParsePipe parserPipe = new ParsePipe(pipe, new SimpleParser());
        BasePath inputPath = platform.makePath("build/test/ParserPipeTest/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
        BasePath outputPath = platform.makePath("build/test/ParserPipeTest/out");
        Tap out = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), outputPath, SinkMode.REPLACE);
View Full Code Here

Examples of bixo.parser.SimpleParser

        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);

        // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);
       
        Pipe analyzerPipe = new Pipe("analyzer pipe");
        analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());
       
View Full Code Here

Examples of bixo.parser.SimpleParser

            super.cleanup(flowProcess, operationCall);
        }
    }

    /**
     * Convenience constructor: builds a ParsePipe on top of {@code fetcherPipe} using a
     * default-constructed {@link SimpleParser}.
     *
     * @param fetcherPipe the upstream pipe, passed through unchanged to the two-argument constructor
     */
    public ParsePipe(Pipe fetcherPipe) {
        this(fetcherPipe, new SimpleParser());
    }
View Full Code Here

Examples of com.simpleplugin.parser.SimpleParser

        return TokenSet.EMPTY;
    }

    /**
     * Creates the parser for this definition.
     *
     * @param project not used by this implementation
     * @return a newly constructed {@link SimpleParser}; a fresh instance is created on every call
     */
    @NotNull
    public PsiParser createParser(final Project project) {
        return new SimpleParser();
    }
View Full Code Here

Examples of dtool.tests.utils.SimpleParser

 
  // Keywords handed to SimpleParser.tryConsume / consumeUntilAny to find case boundaries.
  // Order matters: splitSourceCases treats a match at index 0 or 1 as a header marker.
  protected static final String[] splitKeywords = { "#:HEADER", "Ⓗ", "#:SPLIT", "━━", "▂▂", "▃▃"};
 
  public static boolean isTSPSourceStart(Reader reader) throws IOException {
    String sourceIntro = new String(StreamUtil.readCharAmountFromReader(reader, 10));
    SimpleParser parser = new SimpleParser(sourceIntro);
    return parser.tryConsume(splitKeywords) > 0;
  }
View Full Code Here

Examples of dtool.tests.utils.SimpleParser

    SimpleParser parser = new SimpleParser(sourceIntro);
    return parser.tryConsume(splitKeywords) > 0;
  }
 
  public void splitSourceCases(String defaultMarker, String fileSource) throws TemplatedSourceException {
    SimpleParser parser = new SimpleParser(fileSource);
   
    do {
      boolean isHeader = false;
      String keyMarker = defaultMarker;
     
      int alt = parser.tryConsume(splitKeywords);
      if(alt != SimpleParser.EOF) {
        if(alt == 0 || alt == 1) {
          isHeader = true;
        }
        if(parser.seekToNewLine() == false) {
          handleError(new TemplatedSourceException(parser.getSourcePosition()));
        }
        Matcher matcher = Pattern.compile("→(.)").matcher(parser.getLastConsumedString());
        if(matcher.find()) {
          keyMarker = matcher.group(1);
        }
      } else {
        assertTrue(parser.getSourcePosition() == 0);
      }
     
      parser.consumeUntilAny(splitKeywords);
     
      String unprocessedCaseSource = parser.getLastConsumedString();
      processSplitCaseSource(unprocessedCaseSource, isHeader, keyMarker);
    } while(!parser.lookaheadIsEOF());
  }
View Full Code Here

Examples of dtool.tests.utils.SimpleParser

  }
 
  protected ArrayList<TspElement> parseSource(String unprocessedSource) throws TemplatedSourceException {
    ArrayList<TspElement> unprocessedSourceElements = new ArrayList<TspElement>();
   
    SimpleParser parser = new SimpleParser(unprocessedSource);
    while(true) {
      TspElement tspElem = parseElement(parser);
      if(tspElem == null) {
        break;
      }
View Full Code Here

Examples of dtool.tests.utils.SimpleParser

    }
    throw assertFail();
  }
 
  protected NamedNodeElement[] parseExpectedStructure(String source) {
    SimpleParser parser = new SimpleParser(source);
    NamedNodeElement[] namedElements = readNamedElementsList(parser);
    assertTrue(parser.lookaheadIsEOF() || parser.lookAhead() == '$');
    return namedElements;
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.