trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs >= 3) {
try {
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
trainFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
trainFilter = new NumberRangesFileFilter(args[argIndex], true);
argIndex++;
}
}
} else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams
encoding = args[argIndex + 1];
op.tlpParams.setInputEncoding(encoding);
op.tlpParams.setOutputEncoding(encoding);
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
// load the parser from a binary serialized file
// the next argument must be the path to the parser file
serializedInputFileOrUrl = args[argIndex + 1];
argIndex += 2;
// doesn't make sense to load from TextFile -pichuan
// } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
// // load the parser from declarative text file
// // the next argument must be the path to the parser file
// textInputFileOrUrl = args[argIndex + 1];
// argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
saveToSerializedFile = true;
serializedOutputFileOrUrl = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
// save the parser to declarative text file
saveToTextFile = true;
textOutputFileOrUrl = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-treebank")) {
// the next argument is the treebank path and range for testing
int numSubArgs = numSubArgs(args, argIndex);
argIndex++;
if (numSubArgs == 1) {
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs > 1) {
testPath = args[argIndex++];
if (numSubArgs == 2) {
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs >= 3) {
try {
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
testFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
}
}
}
} else {
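// hand the flag to the language pack's option parser; if it consumed nothing, warn and skip the unknown option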
int j = op.tlpParams.setOptionFlag(args, argIndex);
if (j == argIndex) {
System.err.println("Unknown option ignored: " + args[argIndex]);
j++;
}
argIndex = j;
}
} // end while loop through arguments
TreebankLangParserParams tlpParams = op.tlpParams;
// all other arguments are order dependent and
// are processed in order below
ChineseLexiconAndWordSegmenter cs = null;
if (!train && op.testOptions.verbose) {
System.out.println("Currently " + new Date());
printArgs(args, System.out);
}
if (train) {
printArgs(args, System.out);
// so we train a parser using the treebank
if (treebankPath == null) {
// the next arg must be the treebank path, since it wasn't given earlier
treebankPath = args[argIndex];
argIndex++;
if (args.length > argIndex + 1) {
try {
// the next two args might be the range
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
trainFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
trainFilter = new NumberRangesFileFilter(args[argIndex], true);
argIndex++;
}
}
}
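// read the training treebank and train the combined lexicon and word segmenter over fresh word and tag indices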
Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
Index<String> wordIndex = new HashIndex<String>();
Index<String> tagIndex = new HashIndex<String>();
cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
} else if (textInputFileOrUrl != null) {
// so we load the segmenter from a text grammar file
// XXXXX fix later -pichuan
//cs = new LexicalizedParser(textInputFileOrUrl, true, op);
} else {
// so we load a serialized segmenter
if (serializedInputFileOrUrl == null) {
// the next argument must be the path to the serialized parser
serializedInputFileOrUrl = args[argIndex];
argIndex++;
}
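// deserialize the lexicon and segmenter from the given file or URL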
try {
cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
} catch (IllegalArgumentException e) {
System.err.println("Error loading segmenter, exiting...");
System.exit(0);
}
}
// the following has to go after reading the parser to make sure
// op and tlpParams are the same for train and test
TreePrint treePrint = op.testOptions.treePrint(tlpParams);
if (testFilter != null) {
if (testPath == null) {
if (treebankPath == null) {
throw new RuntimeException("No test treebank path specified...");
} else {
System.err.println("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
testPath = treebankPath;
}
}
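// load the test treebank into memory using the resolved path and file filter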
testTreebank = tlpParams.testMemoryTreebank();
testTreebank.loadPath(testPath, testFilter);
}
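// install the language-specific sister-splitting categories in the training options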
op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(tlpParams.sisterSplitters()));
// at this point we should be sure that op.tlpParams is
// set appropriately (from command line, or from grammar file),
// and will never change again. We also set the tlpParams of the
// LexicalizedParser instance to be the same object. This is
// redundancy that we probably should take out eventually.
//
// -- Roger
if (op.testOptions.verbose) {
System.err.println("Lexicon is " + cs.getClass().getName());
}
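// PrintWriters for standard output and standard error, obtained from the language pack so its encoding is used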
PrintWriter pwOut = tlpParams.pw();
PrintWriter pwErr = tlpParams.pw(System.err);
// Now what do we do with the parser we've made
if (saveToTextFile) {
// save the parser to textGrammar format
if (textOutputFileOrUrl != null) {
saveSegmenterDataToText(cs, textOutputFileOrUrl);
} else {
System.err.println("Usage: must specify a text segmenter data output path");
}
}
if (saveToSerializedFile) {
if (serializedOutputFileOrUrl == null && argIndex < args.length) {
// the next argument must be the path to serialize to
serializedOutputFileOrUrl = args[argIndex];
argIndex++;
}
if (serializedOutputFileOrUrl != null) {
saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
} else if (textOutputFileOrUrl == null && testTreebank == null) {
// no saving/parsing request has been specified
System.err.println("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
}
}
/* --------------------- Testing part!!!! ----------------------- */
if (op.testOptions.verbose) {
// printOptions(false, op);
}
if (testTreebank != null || (argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank"))) {
// test parser on treebank
if (testTreebank == null) {
// the next argument is the treebank path and range for testing
testTreebank = tlpParams.testMemoryTreebank();
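// if a numeric low/high range does not follow the path, load the entire treebank; otherwise restrict it to that file-number range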
if (args.length < argIndex + 4) {
testTreebank.loadPath(args[argIndex + 1]);
} else {
int testlow = Integer.parseInt(args[argIndex + 2]);
int testhigh = Integer.parseInt(args[argIndex + 3]);
testTreebank.loadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
}
}
/* TODO - test segmenting on treebank. -pichuan */
// lp.testOnTreebank(testTreebank);
// } else if (argIndex >= args.length) {