@Test
public void testIt() throws ProtocolException, ParseException, IOException {
String urlString;
Parse parse;
Configuration conf = NutchConfiguration.create();
MimeUtil mimeutil = new MimeUtil(conf);
try {
// read the test string
FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+ sampleText);
StringBuffer sb = new StringBuffer();
int len = 0;
InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
char[] buf = new char[1024];
while ((len = isr.read(buf)) > 0) {
sb.append(buf, 0, len);
}
isr.close();
expectedText = sb.toString();
// normalize space
expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
} catch (Exception e) {
e.printStackTrace();
}
System.out.println("Expected : " + expectedText);
for (int i = 0; i < sampleFiles.length; i++) {
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
if (sampleFiles[i].startsWith("ootest") == false)
continue;
File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
byte[] bytes = new byte[(int) file.length()];
DataInputStream in = new DataInputStream(new FileInputStream(file));
in.readFully(bytes);
in.close();
WebPage page = new WebPage();
page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
String mtype = mimeutil.getMimeType(file);
page.setContentType(new Utf8(mtype));
parse = new ParseUtil(conf).parse(urlString, page);
String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();