package study.string;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.lang.StringBuilder;
import java.nio.charset.StandardCharsets;
/**
 * Command-line utility that removes every {@code <wp:comment>...</wp:comment>}
 * span from a blog export XML file and writes the remaining text to a new file.
 *
 * <p>The input is read line by line; every line is re-terminated with a single
 * {@code '\n'}, so the output uses {@code '\n'} line endings and always ends
 * with one. Timing and progress information is printed to standard output.
 *
 * <p>Files are read and written as UTF-8 rather than with the platform default
 * charset, so the result does not depend on the machine the tool runs on.
 * NOTE(review): this assumes the export is UTF-8 encoded - confirm against the
 * {@code encoding} attribute in the XML declaration of the input file.
 */
public class CleanBlogXML {
    /** Input file used when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT =
            "D:/Dropbox/Life/Blog/goldvase.wordpress.2011-07-12.xml";
    /** Output file used when no second path is supplied on the command line. */
    private static final String DEFAULT_OUTPUT = "D:/Dropbox/Life/Blog/cleanBlog.xml";
    /** Opening marker of a span to remove. */
    private static final String COMMENT_START = "<wp:comment>";
    /** Closing marker of a span to remove. */
    private static final String COMMENT_END = "</wp:comment>";
    /** A progress line is printed each time this many spans have been removed. */
    private static final int PROGRESS_INTERVAL = 1000;

    /**
     * Reads the input file, removes all comment spans and writes the result.
     *
     * @param args optional; {@code args[0]} overrides the input path and
     *             {@code args[1]} overrides the output path. When absent the
     *             built-in default paths are used.
     * @throws IOException          if the input cannot be read or the output
     *                              cannot be written
     * @throws InterruptedException never thrown by this implementation; kept
     *                              so the declared signature stays unchanged
     */
    public static void main(String[] args) throws IOException,
            InterruptedException {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        // ---------- read the whole file ---------------------------
        long startTime = System.currentTimeMillis();
        // The file size in bytes is only a capacity hint; clamp it so that a
        // file larger than 2 GiB cannot wrap to a negative int.
        long sizeHint = new File(inputPath).length();
        int capacity = (int) Math.min(sizeHint, Integer.MAX_VALUE - 8);
        StringBuilder blogXML = new StringBuilder(capacity);
        int rowNumber = readLines(inputPath, blogXML);
        long endTime = System.currentTimeMillis();
        double timeUsed = (endTime - startTime) / 1000d;
        System.out.println("The time used for reading text file is: "
                + timeUsed + ".");
        System.out.println("Number of rows read in: " + rowNumber);
        System.out.println("Original length: " + blogXML.length());

        // ---------- remove comments -------------------------------
        startTime = System.currentTimeMillis();
        StringBuilder cleaned = new StringBuilder(blogXML.length());
        int count = removeComments(blogXML, cleaned, startTime);
        System.out.println(count + " occurrences are found and removed.");
        endTime = System.currentTimeMillis();
        timeUsed = (endTime - startTime) / 1000.;
        System.out.println("The time used for replacement is: " + timeUsed
                + ".");

        // ---------- write the result ------------------------------
        startTime = System.currentTimeMillis();
        writeText(outputPath, cleaned);
        endTime = System.currentTimeMillis();
        timeUsed = (endTime - startTime) / 1000.;
        System.out.println("The time used for writing text is: " + timeUsed
                + ".");
    }

    /**
     * Appends every line of the file to {@code target}, each followed by
     * {@code '\n'}, and returns the number of lines read. The reader is closed
     * even when reading fails.
     */
    private static int readLines(String path, StringBuilder target)
            throws IOException {
        int rows = 0;
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                target.append(line).append('\n');
                rows++;
            }
        }
        return rows;
    }

    /**
     * Copies {@code source} into {@code cleaned}, skipping every span that
     * starts with {@link #COMMENT_START} and ends with the next
     * {@link #COMMENT_END}; returns the number of spans skipped.
     *
     * <p>The text is copied forward once instead of deleting in place, because
     * each in-place delete has to shift the whole remaining tail. The search
     * for the next start marker resumes right after the removed span, which is
     * the same position an in-place delete would resume from.
     *
     * <p>If a start marker has no matching end marker, removal stops there and
     * the rest of the text (including that start marker) is kept unchanged; a
     * warning is printed to standard error.
     */
    private static int removeComments(StringBuilder source,
            StringBuilder cleaned, long startTime) {
        int removed = 0;
        int copyFrom = 0;
        int start = source.indexOf(COMMENT_START);
        while (start > -1) {
            int end = source.indexOf(COMMENT_END, start + COMMENT_START.length());
            if (end < 0) {
                System.err.println("Unterminated " + COMMENT_START
                        + " at offset " + start
                        + "; the remaining text is left unchanged.");
                break;
            }
            cleaned.append(source, copyFrom, start);
            copyFrom = end + COMMENT_END.length();
            removed++;
            if (removed % PROGRESS_INTERVAL == 0) {
                System.out.println(removed
                        + " occurrences are found and removed. Average speed: "
                        + removed
                        / ((System.currentTimeMillis() - startTime) / 1000.)
                        + " occurrences per second.");
            }
            start = source.indexOf(COMMENT_START, copyFrom);
        }
        // Everything after the last removed span is kept as is.
        cleaned.append(source, copyFrom, source.length());
        return removed;
    }

    /**
     * Writes {@code text} to the file at {@code path}, replacing any existing
     * content. The writer is closed even when writing fails.
     */
    private static void writeText(String path, CharSequence text)
            throws IOException {
        try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(path), StandardCharsets.UTF_8))) {
            out.write(text.toString());
        }
    }
}