System.out.println(
"usage: java NsfAbstractCleaner <abstract_dir> <out_file>");
System.exit(1);
}
DocumentPreprocessor processor = new DocumentPreprocessor();
PrintWriter pw = new PrintWriter(args[1]);
File baseAbstractDir = new File(args[0]);
// Iterate over the year directories in the main directory.
for (File abstractYearDir : baseAbstractDir.listFiles()) {
// Skip files that are not directories and files that do not start
// with "awards".
if (!abstractYearDir.isDirectory() ||
!abstractYearDir.getName().startsWith("awards"))
continue;
// Each NSF award year directory is split into several
// subdirectories; iterate over each one.
for (File abstractPartDir : abstractYearDir .listFiles()) {
// Skip any non-directory entries, such as links.html.
if (!abstractPartDir.isDirectory())
continue;
// Iterate over each award.
for (File awardFile : abstractPartDir.listFiles()) {
BufferedReader br =
new BufferedReader(new FileReader(awardFile));
StringBuilder sb = new StringBuilder();
boolean startedContent = false;
// Scan through the posting to find the "Abstract" line.
// This line marks the beginning of the real abstract.
for (String line = null; (line = br.readLine()) != null; ) {
if (startedContent)
sb.append(line).append(" ");
if (line.startsWith("Abstract"))
startedContent = true;
}
// Clean and write the posting's content to the output file.
sb.append("\n");
String cleanedContent = processor.process(sb.toString());
System.out.println(awardFile.getAbsolutePath());
pw.printf("%s\n", cleanedContent);
br.close();
}
}