Package study.string

Source Code of study.string.CleanBlogXML

package study.string;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.StringBuilder;

/**
 * Removes every {@code <wp:comment>...</wp:comment>} element from a text file
 * and writes the remaining text to a second file, printing the time spent on
 * reading, removal and writing to standard output.
 */
public class CleanBlogXML {
  /** Input path used when no path is supplied on the command line. */
  private static final String DEFAULT_INPUT =
      "D:/Dropbox/Life/Blog/goldvase.wordpress.2011-07-12.xml";

  /** Output path used when no second path is supplied on the command line. */
  private static final String DEFAULT_OUTPUT =
      "D:/Dropbox/Life/Blog/cleanBlog.xml";

  /**
   * Reads the input file into memory, strips all comment elements and writes
   * the result to the output file.
   *
   * @param args optional; {@code args[0]} overrides the input path and
   *     {@code args[1]} overrides the output path. Missing entries (or a
   *     {@code null} array) fall back to the built-in default paths.
   * @throws IOException if the input cannot be read or the output cannot be
   *     written
   * @throws InterruptedException never thrown by this implementation; kept so
   *     the declared signature stays unchanged for existing callers
   */
  public static void main(String[] args) throws IOException,
      InterruptedException {
    String file = argOrDefault(args, 0, DEFAULT_INPUT);
    String cleanFile = argOrDefault(args, 1, DEFAULT_OUTPUT);

    // ---------- read the whole file -----------------------------
    long startTime = System.currentTimeMillis();
    // File.length() is a byte count and serves only as a capacity hint.
    // Clamp it so a file larger than 2 GiB cannot overflow the int cast into
    // a negative capacity.
    int len = (int) Math.min(new File(file).length(), Integer.MAX_VALUE - 8);
    StringBuilder blogXML = new StringBuilder(len);
    int rowNumber = 0;
    // NOTE(review): FileReader/FileWriter use the platform default charset;
    // confirm that matches the encoding of the input file.
    BufferedReader in = new BufferedReader(new FileReader(file));
    try {
      String aLine;
      while ((aLine = in.readLine()) != null) {
        // readLine() drops the terminator, so every line is re-terminated
        // with a single '\n'.
        blogXML.append(aLine);
        blogXML.append('\n');
        rowNumber++;
      }
    } finally {
      // Close the reader even when reading fails part-way through.
      in.close();
    }
    long endTime = System.currentTimeMillis();
    double timeUsed = (endTime - startTime) / 1000d;
    System.out.println("The time used for reading text file is: "
        + timeUsed + ".");
    System.out.println("Number of rows read in: " + rowNumber);
    System.out.println("Original length: " + blogXML.length());

    // ---------- remove comments -------------------------------
    startTime = System.currentTimeMillis();
    int count = removeElements(blogXML, "<wp:comment>", "</wp:comment>");
    System.out.println(count + " occurrences are found and removed.");
    endTime = System.currentTimeMillis();
    timeUsed = (endTime - startTime) / 1000.;
    System.out.println("The time used for replacement is: " + timeUsed
        + ".");

    // ---------- write the cleaned text ------------------------
    startTime = System.currentTimeMillis();
    BufferedWriter out = new BufferedWriter(new FileWriter(cleanFile));
    try {
      out.write(blogXML.toString());
    } finally {
      // Close (and thereby flush) the writer even when writing fails.
      out.close();
    }
    endTime = System.currentTimeMillis();
    timeUsed = (endTime - startTime) / 1000.;
    System.out.println("The time used for writing text is: " + timeUsed
        + ".");
  }

  /**
   * Deletes, in place, every span of {@code text} that starts with
   * {@code openTag} and ends with the next following {@code closeTag}
   * (both tags included).
   *
   * <p>The text is scanned once from left to right and the kept segments are
   * copied into a second buffer, so the work is linear in the text length
   * instead of shifting the tail of the buffer on every deletion. Matching
   * always resumes after the removed span, so text that only lines up into a
   * tag because of a removal is never treated as a new match.
   *
   * <p>If an {@code openTag} has no following {@code closeTag}, scanning
   * stops, a warning is printed to standard error, and the text from that
   * {@code openTag} onwards is kept unchanged.
   *
   * <p>A progress line is printed to standard output after every 1000
   * removals.
   *
   * @param text buffer to clean; modified in place
   * @param openTag literal text that starts a span to remove; not empty
   * @param closeTag literal text that ends a span to remove; not empty
   * @return the number of spans removed
   * @throws IllegalArgumentException if either tag is empty
   */
  static int removeElements(StringBuilder text, String openTag,
      String closeTag) {
    if (openTag.length() == 0 || closeTag.length() == 0) {
      throw new IllegalArgumentException("Tags must not be empty.");
    }
    long startTime = System.currentTimeMillis();
    StringBuilder kept = new StringBuilder(text.length());
    int copyFrom = 0; // start of the segment not yet copied into 'kept'
    int count = 0;
    int startIndex = text.indexOf(openTag);
    while (startIndex > -1) {
      int endIndex = text.indexOf(closeTag, startIndex + openTag.length());
      if (endIndex < 0) {
        // Unterminated element: there is nothing well-defined to delete.
        System.err.println("Warning: no \"" + closeTag
            + "\" found after index " + startIndex
            + "; the rest of the text is left unchanged.");
        break;
      }
      kept.append(text, copyFrom, startIndex);
      copyFrom = endIndex + closeTag.length();
      count++;
      if (count % 1000 == 0) {
        System.out.println(count
            + " occurrences are found and removed. Average speed: "
            + count
            / ((System.currentTimeMillis() - startTime) / 1000.)
            + " occurrences per second.");
      }
      startIndex = text.indexOf(openTag, copyFrom);
    }
    if (count > 0) {
      // Copy the tail after the last removed span, then swap the cleaned
      // content into the caller's buffer.
      kept.append(text, copyFrom, text.length());
      text.setLength(0);
      text.append(kept);
    }
    return count;
  }

  /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
  private static String argOrDefault(String[] args, int index,
      String fallback) {
    return (args != null && args.length > index) ? args[index] : fallback;
  }
}
TOP

Related Classes of study.string.CleanBlogXML

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.