Package edu.stanford.nlp.io

Examples of edu.stanford.nlp.io.NumberRangeFileFilter
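All of the excerpts below come from the Stanford parser code base and use the filter in the same way: a NumberRangeFileFilter(low, high, recurse) accepts files whose names contain a number between low and high, its sibling NumberRangesFileFilter takes a textual ranges expression instead of two bounds, and the resulting java.io.FileFilter is passed to Treebank.loadPath so that only the matching treebank files are read. As a minimal, self-contained sketch (the treebank directory and the numeric bounds are placeholders, not taken from the excerpts):

  import edu.stanford.nlp.io.NumberRangeFileFilter;
  import edu.stanford.nlp.trees.DiskTreebank;
  import edu.stanford.nlp.trees.Tree;
  import edu.stanford.nlp.trees.Treebank;

  public class NumberRangeFileFilterDemo {
    public static void main(String[] args) {
      String treebankPath = args[0]; // e.g. a directory of numbered .mrg files (placeholder)
      Treebank treebank = new DiskTreebank();
      // only load files whose names contain a number in [200, 2199];
      // the trailing true matches the recursive flag used throughout the excerpts below
      treebank.loadPath(treebankPath, new NumberRangeFileFilter(200, 2199, true));
      for (Tree tree : treebank) {
        System.out.println(tree.yield().size() + " words: " + tree);
      }
    }
  }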


          trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs >= 3) {
          try {
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            trainFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
          }
        }
      } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams
        encoding = args[argIndex + 1];
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
        // load the parser from a binary serialized file
        // the next argument must be the path to the parser file
        serializedInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
        // doesn't make sense to load from TextFile -pichuan
        //      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        //        // load the parser from declarative text file
        //        // the next argument must be the path to the parser file
        //        textInputFileOrUrl = args[argIndex + 1];
        //        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
        saveToSerializedFile = true;
        serializedOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
        // save the parser to declarative text file
        saveToTextFile = true;
        textOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-treebank")) {
        // the next argument is the treebank path and range for testing
        int numSubArgs = numSubArgs(args, argIndex);
        argIndex++;
        if (numSubArgs == 1) {
          testFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs > 1) {
          testPath = args[argIndex++];
          if (numSubArgs == 2) {
            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
          } else if (numSubArgs >= 3) {
            try {
              int low = Integer.parseInt(args[argIndex]);
              int high = Integer.parseInt(args[argIndex + 1]);
              testFilter = new NumberRangeFileFilter(low, high, true);
              argIndex += 2;
            } catch (NumberFormatException e) {
              // maybe it's a ranges expression?
              testFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
          }
        }
      } else {
        int j = op.tlpParams.setOptionFlag(args, argIndex);
        if (j == argIndex) {
          System.err.println("Unknown option ignored: " + args[argIndex]);
          j++;
        }
        argIndex = j;
      }
    } // end while loop through arguments

    TreebankLangParserParams tlpParams = op.tlpParams;

    // all other arguments are order dependent and
    // are processed in order below

    ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose) {
      System.out.println("Currently " + new Date());
      printArgs(args, System.out);
    }
    if (train) {
      printArgs(args, System.out);
      // so we train a parser using the treebank
      if (treebankPath == null) {
        // the next arg must be the treebank path, since it wasn't given earlier
        treebankPath = args[argIndex];
        argIndex++;
        if (args.length > argIndex + 1) {
          try {
            // the next two args might be the range
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            trainFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
          }
        }
      }
      Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
      Index<String> wordIndex = new HashIndex<String>();
      Index<String> tagIndex = new HashIndex<String>();
      cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    } else if (textInputFileOrUrl != null) {
      // so we load the segmenter from a text grammar file
      // XXXXX fix later -pichuan
      //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
    } else {
      // so we load a serialized segmenter
      if (serializedInputFileOrUrl == null) {
        // the next argument must be the path to the serialized parser
        serializedInputFileOrUrl = args[argIndex];
        argIndex++;
      }
      try {
        cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
      } catch (IllegalArgumentException e) {
        System.err.println("Error loading segmenter, exiting...");
        System.exit(0);
      }
    }

    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    TreePrint treePrint = op.testOptions.treePrint(tlpParams);

    if (testFilter != null) {
      if (testPath == null) {
        if (treebankPath == null) {
          throw new RuntimeException("No test treebank path specified...");
        } else {
          System.err.println("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
          testPath = treebankPath;
        }
      }
      testTreebank = tlpParams.testMemoryTreebank();
      testTreebank.loadPath(testPath, testFilter);
    }

    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(tlpParams.sisterSplitters()));

    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again.  We also set the tlpParams of the
    // LexicalizedParser instance to be the same object.  This is
    // redundancy that we probably should take out eventually.
    //
    // -- Roger
    if (op.testOptions.verbose) {
      System.err.println("Lexicon is " + cs.getClass().getName());
    }

    PrintWriter pwOut = tlpParams.pw();
    PrintWriter pwErr = tlpParams.pw(System.err);


    // Now what do we do with the parser we've made
    if (saveToTextFile) {
      // save the parser to textGrammar format
      if (textOutputFileOrUrl != null) {
        saveSegmenterDataToText(cs, textOutputFileOrUrl);
      } else {
        System.err.println("Usage: must specify a text segmenter data output path");
      }
    }
    if (saveToSerializedFile) {
      if (serializedOutputFileOrUrl == null && argIndex < args.length) {
        // the next argument must be the path to serialize to
        serializedOutputFileOrUrl = args[argIndex];
        argIndex++;
      }
      if (serializedOutputFileOrUrl != null) {
        saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
      } else if (textOutputFileOrUrl == null && testTreebank == null) {
        // no saving/parsing request has been specified
        System.err.println("usage: java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter -train trainFilesPath [start stop] serializedParserFilename");
      }
    }
    /* --------------------- Testing part!!!! ----------------------- */
    if (op.testOptions.verbose) {
//      printOptions(false, op);
    }
    if (testTreebank != null || (argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank"))) {
      // test parser on treebank
      if (testTreebank == null) {
        // the next argument is the treebank path and range for testing
        testTreebank = tlpParams.testMemoryTreebank();
        if (args.length < argIndex + 4) {
          testTreebank.loadPath(args[argIndex + 1]);
        } else {
          int testlow = Integer.parseInt(args[argIndex + 2]);
          int testhigh = Integer.parseInt(args[argIndex + 3]);
          testTreebank.loadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
        }
      }
      /* TODO - test segmenting on treebank. -pichuan */
//      lp.testOnTreebank(testTreebank);
//    } else if (argIndex >= args.length) {
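A pattern that recurs in several of these excerpts is: try to read two integers as the low and high bounds for a NumberRangeFileFilter, and if that fails, fall back to treating the single argument as a ranges expression for a NumberRangesFileFilter. One way to factor it out is sketched below; the RangeArgs helper and its return-the-next-index convention are mine, not part of the Stanford API.

  import java.io.FileFilter;
  import edu.stanford.nlp.io.NumberRangeFileFilter;
  import edu.stanford.nlp.io.NumberRangesFileFilter;

  final class RangeArgs {
    final FileFilter filter;
    final int nextIndex; // index of the first unconsumed argument

    RangeArgs(FileFilter filter, int nextIndex) {
      this.filter = filter;
      this.nextIndex = nextIndex;
    }

    /** Consume either "low high" (two ints) or a single ranges expression starting at args[i]. */
    static RangeArgs parse(String[] args, int i) {
      try {
        int low = Integer.parseInt(args[i]);
        int high = Integer.parseInt(args[i + 1]);
        return new RangeArgs(new NumberRangeFileFilter(low, high, true), i + 2);
      } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
        // not two plain integers, so treat the single argument as a ranges expression
        return new RangeArgs(new NumberRangesFileFilter(args[i], true), i + 1);
      }
    }
  }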


        if (numSubArgs == 2) {
          trainFilter = new NumberRangesFileFilter(args[i++], true);
        } else if (numSubArgs >= 3) {
          int low = Integer.parseInt(args[i]);
          int high = Integer.parseInt(args[i + 1]);
          trainFilter = new NumberRangeFileFilter(low, high, true);
          i += 2;
        }
      } else {
        i = op.setOption(args, i);
      }

    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank(); // this is a new one
    TreebankLanguagePack tlp = op.langpack();

    trainTreebank.loadPath(path, new NumberRangeFileFilter(low, high, true));

    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, op.trainOptions.selectiveSplitCutOff, op.tlpParams.treebankLanguagePack());
    }
    if (op.trainOptions.selectivePostSplit) {

    // these for testing against the markov 3rd order baseline

    // use the parser constructor to extract the grammars from the treebank
    op = new Options();
    LexicalizedParser lp = LexicalizedParser.trainFromTreebank(path, new NumberRangeFileFilter(trainLow, trainHigh, true), op);

    // compact grammars
    if (compactor != null) {

      // extract a bunch of paths
      Timing.startTime();
      System.out.print("Extracting other paths...");
      allTrainPaths = extractPaths(path, trainLow, trainHigh, true);
      allTestPaths = extractPaths(path, testLow, testHigh, true);
      Timing.tick("done");

      // compact grammars
      Timing.startTime();
      System.out.print("Compacting grammars...");
      Pair<UnaryGrammar, BinaryGrammar> grammar = Generics.newPair(lp.ug, lp.bg);
      Triple<Index<String>, UnaryGrammar, BinaryGrammar> compactedGrammar = compactor.compactGrammar(grammar, allTrainPaths, allTestPaths, lp.stateIndex);
      lp.stateIndex = compactedGrammar.first();
      lp.ug = compactedGrammar.second();
      lp.bg = compactedGrammar.third();

      Timing.tick("done.");
    }

    if (asciiOutputPath != null) {
      lp.saveParserToTextFile(asciiOutputPath);
    }

    // test it
    Treebank testTreebank = op.tlpParams.testMemoryTreebank();
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    System.out.println("Currently " + new Date());
    EvaluateTreebank evaluator = new EvaluateTreebank(lp);
    evaluator.testOnTreebank(testTreebank);
    System.out.println("Currently " + new Date());
  }
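Stripped of the grammar-compaction step, the train-then-evaluate skeleton in the excerpt above reduces to the following sketch (paths and file-number ranges are placeholders):

  Options op = new Options();
  LexicalizedParser lp = LexicalizedParser.trainFromTreebank(
      "/path/to/treebank", new NumberRangeFileFilter(1, 199, true), op);
  Treebank testTreebank = op.tlpParams.testMemoryTreebank();
  testTreebank.loadPath("/path/to/treebank", new NumberRangeFileFilter(200, 270, true));
  new EvaluateTreebank(lp).testOnTreebank(testTreebank);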



  public static List<Tree> getTrees(String path, int low, int high, int minLength, int maxLength) {
    Treebank treebank = new DiskTreebank(in -> new PennTreeReader(in, new LabeledScoredTreeFactory(new WordFactory()), new BobChrisTreeNormalizer()));
    treebank.loadPath(path, new NumberRangeFileFilter(low, high, true));
    List<Tree> trees = new ArrayList<Tree>();
    for (Tree tree : treebank) {
      if (tree.yield().size() <= maxLength && tree.yield().size() >= minLength) {
        trees.add(tree);
      }
    }
    return trees;
  }
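A hypothetical call to the getTrees helper above (path and bounds are placeholders): read trees from the files numbered 200-270 and keep only those whose yields are between 1 and 40 words.

  List<Tree> shortTrees = getTrees("/path/to/treebank", 200, 270, 1, 40);
  System.out.println("kept " + shortTrees.size() + " trees");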

        filter = new NumberRangesFileFilter(args[argIndex++], true);
      } else if (numSubArgs == 3) {
        try {
          int low = Integer.parseInt(args[argIndex]);
          int high = Integer.parseInt(args[argIndex + 1]);
          filter = new NumberRangeFileFilter(low, high, true);
          argIndex += 2;
        } catch (NumberFormatException e) {
          // maybe it's a ranges expression?
          filter = new NumberRangesFileFilter(args[argIndex++], true);
        }

    LexicalizedParser pd = LexicalizedParser.loadModel(args[0]);
    op = pd.getOp(); // in case a serialized Options object was read in
    Treebank testTreebank = op.tlpParams.memoryTreebank();
    int testlow = Integer.parseInt(args[2]);
    int testhigh = Integer.parseInt(args[3]);
    testTreebank.loadPath(args[1], new NumberRangeFileFilter(testlow, testhigh, true));
    op.setOptionsOrWarn(args, 4, args.length);
    testOnTreebank(pd, new EnglishTreebankParserParams(), testTreebank, args[1], pd.stateIndex);
  }
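In the excerpt above the positional arguments are: args[0] the serialized parser model, args[1] the treebank path, and args[2]/args[3] the low and high file numbers for the NumberRangeFileFilter, with any remaining arguments handed to setOptionsOrWarn. A hypothetical invocation (the class and file names are placeholders):

  java SomeEvalClass englishPCFG.ser.gz /path/to/treebank 2300 2399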

   * Treebank was available, and the pre-stored list was being used).
   */
  public static Set<String> getEnglishSplitCategories(String treebankRoot) {
    TreebankLangParserParams tlpParams = new EnglishTreebankParserParams();
    Treebank trees = tlpParams.memoryTreebank();
    trees.loadPath(treebankRoot, new NumberRangeFileFilter(200, 2199, true));
    return getSplitCategories(trees, 300.0, tlpParams.treebankLanguagePack());
  }

    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);

    Timing.startTime();
    System.err.print("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (op.testOptions.increasingLength) {
      Collections.sort(testTreebank, new TreeLengthComparator());
    }

    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");

    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight) {
