Package edu.ucla.sspace.text

Examples of edu.ucla.sspace.text.DocumentPreprocessor$DocHash


            System.out.println(
                    "usage: java NsfAbstractCleaner <abstract_dir> <out_file>");
            System.exit(1);
        }

        DocumentPreprocessor processor = new DocumentPreprocessor();
        PrintWriter pw = new PrintWriter(args[1]);

        File baseAbstractDir = new File(args[0]);
        // Iterate over the year directories in the main directory.
        for (File abstractYearDir : baseAbstractDir.listFiles()) {

            // Skip files that are not directories and files that do not start
            // with "awards".
            if (!abstractYearDir.isDirectory() ||
                !abstractYearDir.getName().startsWith("awards"))
                continue;

            // Each NSF award year directory is split into several
            // subdirectories, iterate over each one.
            for (File abstractPartDir : abstractYearDir .listFiles()) {

                // Skip any non directory entries, such as links.html.
                if (!abstractPartDir.isDirectory())
                    continue;

                // Iterate over each award.
                for (File awardFile : abstractPartDir.listFiles()) {
                    BufferedReader br =
                        new BufferedReader(new FileReader(awardFile));
                    StringBuilder sb = new StringBuilder();
                    boolean startedContent = false;

                    // Scan through the posting to find the "Abstract" line.
                    // This line marks the beginning of the real abstract.
                    for (String line = null; (line = br.readLine()) != null; ) {
                        if (startedContent)
                            sb.append(line).append(" ");
                        if (line.startsWith("Abstract"))
                            startedContent = true;
                    }

                    // Clean and write the posting's content to the output file.
                    sb.append("\n");
                    String cleanedContent = processor.process(sb.toString());
                    System.out.println(awardFile.getAbsolutePath());
                    pw.printf("%s\n", cleanedContent);
                    br.close();
                }
            }
View Full Code Here


        String article = rawArticleText.toString();

        // Begin removing any tokens according to the options
        if (options.contains(CleanerOption.USE_PREPROCESSOR)) {
            LOGGER.finer("applying preprocessor");
            article = new DocumentPreprocessor().process(article);
        }
        if (options.contains(CleanerOption.FILTER_TOKENS)) {
            LOGGER.finer("filtering tokens");
            article = filterTokens(article);
        }
View Full Code Here

  try {
      if (args.length != 2) {
    usage();
    return;
      }
      DocumentPreprocessor processor = new DocumentPreprocessor();
      BufferedReader br = new BufferedReader(new FileReader(args[0]));
      BufferedWriter bw = new BufferedWriter(new FileWriter(args[1]));
      for (String line = null; (line = br.readLine()) != null;) {
    String cleaned = processor.process(line);
    if (!cleaned.equals("")){
        bw.write(cleaned);
        bw.newLine();
    }
      }
View Full Code Here

            System.out.println(
                    "usage: java TwentyNewsGroupCleaner <ng_dir> <out_file>");
            System.exit(1);
        }

        DocumentPreprocessor processor = new DocumentPreprocessor();
        PrintWriter pw = new PrintWriter(args[1]);

        File baseNGDir = new File(args[0]);
        // Iterate over the newsgroup directories in the main directory.
        for (File newsGroupDir : baseNGDir.listFiles()) {

            // Skip any non-directories.
            if (!newsGroupDir.isDirectory())
                continue;

            // Iterate over the individual postings in each newsgroup.
            for (File newsGroupEntry : newsGroupDir.listFiles()) {
                BufferedReader br =
                    new BufferedReader(new FileReader(newsGroupEntry));
                StringBuilder sb = new StringBuilder();
                boolean startedContent = false;

                // Scan through the posting to find the "Lines" line.  This line
                // marks the beginning of the real newsgroup data.
                for (String line = null; (line = br.readLine()) != null; ) {
                    if (startedContent)
                        sb.append(line).append(" ");
                    if (line.startsWith("Lines:"))
                        startedContent = true;
                }

                // Clean and write the posting's content to the output file.
                sb.append("\n");
                String cleanedContent = processor.process(sb.toString());
                System.out.println(newsGroupEntry.getAbsolutePath());
                pw.printf("%s\n", cleanedContent);
                br.close();
            }
        }
View Full Code Here

    PrintWriter writer = null;
    beginTime = begin;
    endTime = end;
    try {
      writer = new PrintWriter(outFile);
      processor = new DocumentPreprocessor(wordFile);
    } catch (FileNotFoundException fnee) {
      fnee.printStackTrace();
      System.exit(1);
    } catch (IOException ioe) {
      ioe.printStackTrace();
View Full Code Here

TOP

Related Classes of edu.ucla.sspace.text.DocumentPreprocessor$DocHash

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.