Package edu.stanford.nlp.io

Examples of edu.stanford.nlp.io.NumberRangesFileFilter


    String encoding = getEncoding(config);
    String tagSeparator = getTagSeparator(config);
    TreeTransformer treeTransformer = null;
    TreeNormalizer treeNormalizer = null;
    TreeReaderFactory trf = null;
    NumberRangesFileFilter treeRange = null;
    Predicate<Tree> treeFilter = null;
    Integer wordColumn = null, tagColumn = null;

    for (String arg : args) {
      String[] argPieces = arg.split("=", 2);
      if (argPieces.length != 2) {
        throw new IllegalArgumentException("TaggedFileRecord argument " + arg +
                                           " has an unexpected number of =s");
      }
      if (argPieces[0].equalsIgnoreCase(FORMAT)) {
        format = Format.valueOf(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(ENCODING)) {
        encoding = argPieces[1];
      } else if (argPieces[0].equalsIgnoreCase(TAG_SEPARATOR)) {
        tagSeparator = argPieces[1];
      } else if (argPieces[0].equalsIgnoreCase(TREE_TRANSFORMER)) {
        treeTransformer = ReflectionLoading.loadByReflection(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(TREE_NORMALIZER)) {
        treeNormalizer = ReflectionLoading.loadByReflection(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(TREE_READER)) {
        trf = ReflectionLoading.loadByReflection(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(TREE_RANGE)) {
        String range = argPieces[1].replaceAll(":", ",");
        treeRange = new NumberRangesFileFilter(range, true);
      } else if (argPieces[0].equalsIgnoreCase(TREE_FILTER)) {
        treeFilter = ReflectionLoading.loadByReflection(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(WORD_COLUMN)) {
        wordColumn = Integer.valueOf(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(TAG_COLUMN)) {
View Full Code Here


    Options op = new Options(ctpp);

    if (argMap.containsKey("-stats")) {
      String[] statArgs = (argMap.get("-stats"));
      MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
      FileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
      rawTrainTreebank.loadPath(new File(statArgs[0]), trainFilt);
      System.err.println("Done reading trees.");
      MemoryTreebank trainTreebank;
      if (argMap.containsKey("-annotate")) {
        trainTreebank = new MemoryTreebank();
        TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
        for (Tree tree : rawTrainTreebank) {
          trainTreebank.add(annotator.transformTree(tree));
        }
        System.err.println("Done annotating trees.");
      } else {
        trainTreebank = rawTrainTreebank;
      }
      printStats(trainTreebank, pw);
      System.exit(0);
    }

    int maxLength = 1000000;
    //    Test.verbose = true;
    if (argMap.containsKey("-norm")) {
      op.testOptions.lengthNormalization = true;
    }
    if (argMap.containsKey("-maxLength")) {
      maxLength = Integer.parseInt((argMap.get("-maxLength"))[0]);
    }
    op.testOptions.maxLength = 120;
    boolean combo = argMap.containsKey("-combo");
    if (combo) {
      ctpp.useCharacterBasedLexicon = true;
      op.testOptions.maxSpanForTags = 10;
      op.doDep = false;
      op.dcTags = false;
    }

    LexicalizedParser lp = null;
    Lexicon lex = null;
    if (argMap.containsKey("-parser")) {
      String[] parserArgs = (argMap.get("-parser"));
      if (parserArgs.length > 1) {
        FileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
        lp = LexicalizedParser.trainFromTreebank(parserArgs[0], trainFilt, op);
        if (parserArgs.length == 3) {
          String filename = parserArgs[2];
          System.err.println("Writing parser in serialized format to file " + filename + " ");
          System.err.flush();
          ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
          out.writeObject(lp);
          out.close();
          System.err.println("done.");
        }
      } else {
        String parserFile = parserArgs[0];
        lp = LexicalizedParser.loadModel(parserFile, op);
      }
      lex = lp.getLexicon();
      op = lp.getOp();
      ctpp = (ChineseTreebankParserParams) op.tlpParams;
    }

    if (argMap.containsKey("-rad")) {
      ctpp.useUnknownCharacterModel = true;
    }

    if (argMap.containsKey("-lengthPenalty")) {
      ctpp.lengthPenalty = Double.parseDouble((argMap.get("-lengthPenalty"))[0]);
    }

    if (argMap.containsKey("-penaltyType")) {
      ctpp.penaltyType = Integer.parseInt((argMap.get("-penaltyType"))[0]);
    }

    if (argMap.containsKey("-lex")) {
      String[] lexArgs = (argMap.get("-lex"));
      if (lexArgs.length > 1) {
        Index<String> wordIndex = new HashIndex<String>();
        Index<String> tagIndex = new HashIndex<String>();
        lex = ctpp.lex(op, wordIndex, tagIndex);
        MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
        FileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
        rawTrainTreebank.loadPath(new File(lexArgs[0]), trainFilt);
        System.err.println("Done reading trees.");
        MemoryTreebank trainTreebank;
        if (argMap.containsKey("-annotate")) {
          trainTreebank = new MemoryTreebank();
          TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
          for (Iterator iter = rawTrainTreebank.iterator(); iter.hasNext();) {
            Tree tree = (Tree) iter.next();
            tree = annotator.transformTree(tree);
            trainTreebank.add(tree);
          }
          System.err.println("Done annotating trees.");
        } else {
          trainTreebank = rawTrainTreebank;
        }
        lex.initializeTraining(trainTreebank.size());
        lex.train(trainTreebank);
        lex.finishTraining();
        System.err.println("Done training lexicon.");
        if (lexArgs.length == 3) {
          String filename = lexArgs.length == 3 ? lexArgs[2] : "parsers/chineseCharLex.ser.gz";
          System.err.println("Writing lexicon in serialized format to file " + filename + " ");
          System.err.flush();
          ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
          out.writeObject(lex);
          out.close();
          System.err.println("done.");
        }
      } else {
        String lexFile = lexArgs.length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
        System.err.println("Reading Lexicon from file " + lexFile);
        ObjectInputStream in = IOUtils.readStreamFromString(lexFile);
        try {
          lex = (Lexicon) in.readObject();
        } catch (ClassNotFoundException e) {
          throw new RuntimeException("Bad serialized file: " + lexFile);
        }
        in.close();
      }
    }

    if (argMap.containsKey("-test")) {
      boolean segmentWords = ctpp.segment;
      boolean parse = lp != null;
      assert (parse || segmentWords);
      //      WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
      //      WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
      WordSegmenter seg = null;
      if (segmentWords) {
        seg = (WordSegmenter) lex;
      }
      String[] testArgs = (argMap.get("-test"));
      MemoryTreebank testTreebank = op.tlpParams.memoryTreebank();
      FileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
      testTreebank.loadPath(new File(testArgs[0]), testFilt);
      TreeTransformer subcategoryStripper = op.tlpParams.subcategoryStripper();
      TreeTransformer collinizer = ctpp.collinizer();

      WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
View Full Code Here

    TreebankLanguagePack ctlp = tlpParams.treebankLanguagePack();
    Options op = new Options(tlpParams);
    TreeAnnotator ta = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);

    System.err.println("Reading Trees...");
    FileFilter trainFilter = new NumberRangesFileFilter(args[1], true);
    Treebank trainTreebank = tlpParams.memoryTreebank();
    trainTreebank.loadPath(args[0], trainFilter);

    System.err.println("Annotating trees...");
    Collection<Tree> trainTrees = new ArrayList<Tree>();
    for (Tree tree : trainTreebank) {
      trainTrees.add(ta.transformTree(tree));
    }
    trainTreebank = null; // saves memory

    System.err.println("Training lexicon...");

    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    int featureLevel = DEFAULT_FEATURE_LEVEL;
    if (args.length > 3) {
      featureLevel = Integer.parseInt(args[3]);
    }
    ChineseMaxentLexicon lex = new ChineseMaxentLexicon(op, wordIndex, tagIndex, featureLevel);
    lex.initializeTraining(trainTrees.size());
    lex.train(trainTrees);
    lex.finishTraining();

    System.err.print("Testing");

    FileFilter testFilter = new NumberRangesFileFilter(args[2], true);
    Treebank testTreebank = tlpParams.memoryTreebank();
    testTreebank.loadPath(args[0], testFilter);
    List<TaggedWord> testWords = new ArrayList<TaggedWord>();
    for (Tree t : testTreebank) {
      for (TaggedWord tw : t.taggedYield()) {
View Full Code Here

          argIndex++;
        } else {
          throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
        }
        if (numSubArgs == 2) {
          trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs >= 3) {
          try {
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            trainFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
          }
        }
      } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams
        encoding = args[argIndex + 1];
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
        // load the parser from a binary serialized file
        // the next argument must be the path to the parser file
        serializedInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
        // doesn't make sense to load from TextFile -pichuan
        //      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        //        // load the parser from declarative text file
        //        // the next argument must be the path to the parser file
        //        textInputFileOrUrl = args[argIndex + 1];
        //        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
        saveToSerializedFile = true;
        serializedOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
        // save the parser to declarative text file
        saveToTextFile = true;
        textOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
      } else if (args[argIndex].equalsIgnoreCase("-treebank")) {
        // the next argument is the treebank path and range for testing
        int numSubArgs = numSubArgs(args, argIndex);
        argIndex++;
        if (numSubArgs == 1) {
          testFilter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs > 1) {
          testPath = args[argIndex++];
          if (numSubArgs == 2) {
            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
          } else if (numSubArgs >= 3) {
            try {
              int low = Integer.parseInt(args[argIndex]);
              int high = Integer.parseInt(args[argIndex + 1]);
              testFilter = new NumberRangeFileFilter(low, high, true);
              argIndex += 2;
            } catch (NumberFormatException e) {
              // maybe it's a ranges expression?
              testFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
          }
        }
      } else {
        int j = op.tlpParams.setOptionFlag(args, argIndex);
        if (j == argIndex) {
          System.err.println("Unknown option ignored: " + args[argIndex]);
          j++;
        }
        argIndex = j;
      }
    } // end while loop through arguments

    TreebankLangParserParams tlpParams = op.tlpParams;

    // all other arguments are order dependent and
    // are processed in order below

    ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose) {
      System.out.println("Currently " + new Date());
      printArgs(args, System.out);
    }
    if (train) {
      printArgs(args, System.out);
      // so we train a parser using the treebank
      if (treebankPath == null) {
        // the next arg must be the treebank path, since it wasn't give earlier
        treebankPath = args[argIndex];
        argIndex++;
        if (args.length > argIndex + 1) {
          try {
            // the next two args might be the range
            int low = Integer.parseInt(args[argIndex]);
            int high = Integer.parseInt(args[argIndex + 1]);
            trainFilter = new NumberRangeFileFilter(low, high, true);
            argIndex += 2;
          } catch (NumberFormatException e) {
            // maybe it's a ranges expression?
            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
          }
        }
      }
      Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
View Full Code Here

      System.err.println("java BaseLexicon treebankPath fileRange unknownWordModel words*");
      return;
    }
    System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
    Treebank tb = new DiskTreebank();
    tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
    // TODO: change this interface so the lexicon creates its own indices?
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    Options op = new Options();
    op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
View Full Code Here

          i++;
        } else {
          throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
        }
        if (numSubArgs == 2) {
          trainFilter = new NumberRangesFileFilter(args[i++], true);
        } else if (numSubArgs >= 3) {
          int low = Integer.parseInt(args[i]);
          int high = Integer.parseInt(args[i + 1]);
          trainFilter = new NumberRangeFileFilter(low, high, true);
          i += 2;
View Full Code Here

    if (args.length < 2) {
      printlnErr("Usage: edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams treesPath fileRange");
    } else {
      Treebank m = tlpp.diskTreebank();
      m.loadPath(args[0], new NumberRangesFileFilter(args[1], false));

      for (Tree t : m ) {
        t.pennPrint(tlpp.pw());
      }
      System.out.println("There were " + m.size() + " trees.");
View Full Code Here

        weight = Double.parseDouble(args[argIndex + numSubArgs - 2]);
        hasWeight = true;
        numSubArgs--;
      }
      if (numSubArgs == 2) {
        filter = new NumberRangesFileFilter(args[argIndex++], true);
      } else if (numSubArgs == 3) {
        try {
          int low = Integer.parseInt(args[argIndex]);
          int high = Integer.parseInt(args[argIndex + 1]);
          filter = new NumberRangeFileFilter(low, high, true);
          argIndex += 2;
        } catch (NumberFormatException e) {
          // maybe it's a ranges expression?
          filter = new NumberRangesFileFilter(args[argIndex++], true);
        }
      }
      if (hasWeight) {
        argIndex++;
      }
View Full Code Here

    }

    final PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);

    if (i + 1 < args.length ) {
      treebank.loadPath(args[i], new NumberRangesFileFilter(args[i+1], true));
    } else if (i < args.length) {
      treebank.loadPath(args[i], suffix, true);
    } else {
      printUsage();
      return;
View Full Code Here

      String pre = filters.get(FilterType.hasPrefix);
      if(!fileName.startsWith(pre))
        return false;
    }
    if(filters.containsKey(FilterType.isInRange)) {
      NumberRangesFileFilter f = new NumberRangesFileFilter(filters.get(FilterType.isInRange), false);
      if(!f.accept(fileName))
        return false;
    }
    return true;
  }
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.io.NumberRangesFileFilter

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.