/**
* Copyright 2011 Pablo Mendes, Max Jakob
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dbpedia.spotlight.evaluation.external;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.Logger;
import org.dbpedia.spotlight.exceptions.AnnotationException;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.Text;
import java.io.*;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
/**
* This class has been translate to scala. Please use the AnnotationClientScala.scala for new External Clients!
* (AnnotationClientScala.scala is at eval/src/main/scala/org/dbpedia/spotlight/evaluation/external/)
*
* @author pablomendes
*/
public abstract class AnnotationClient {
public Logger LOG = Logger.getLogger(this.getClass());
// Create an instance of HttpClient.
private static HttpClient client = new HttpClient();
public String request(HttpMethod method) throws AnnotationException {
String response = null;
// Provide custom retry handler is necessary
method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
new DefaultHttpMethodRetryHandler(3, false));
try {
// Execute the method.
int statusCode = client.executeMethod(method);
if (statusCode != HttpStatus.SC_OK) {
LOG.error("Method failed: " + method.getStatusLine());
}
// Read the response body.
byte[] responseBody = method.getResponseBody(); //TODO Going to buffer response body of large or unknown size. Using getResponseBodyAsStream instead is recommended.
// Deal with the response.
// Use caution: ensure correct character encoding and is not binary data
response = new String(responseBody);
} catch (HttpException e) {
LOG.error("Fatal protocol violation: " + e.getMessage());
throw new AnnotationException("Protocol error executing HTTP request.",e);
} catch (IOException e) {
LOG.error("Fatal transport error: " + e.getMessage());
LOG.error(method.getQueryString());
throw new AnnotationException("Transport error executing HTTP request.",e);
} finally {
// Release the connection.
method.releaseConnection();
}
return response;
}
protected static String readFileAsString(String filePath) throws java.io.IOException{
return readFileAsString(new File(filePath));
}
protected static String readFileAsString(File file) throws IOException {
byte[] buffer = new byte[(int) file.length()];
BufferedInputStream f = new BufferedInputStream(new FileInputStream(file));
f.read(buffer);
return new String(buffer);
}
static abstract class LineParser {
public abstract String parse(String s) throws ParseException;
static class ManualDatasetLineParser extends LineParser {
public String parse(String s) throws ParseException {
return s.trim();
}
}
static class OccTSVLineParser extends LineParser {
public String parse(String s) throws ParseException {
String result = s;
try {
result = s.trim().split("\t")[3];
} catch (ArrayIndexOutOfBoundsException e) {
throw new ParseException(e.getMessage(), 3);
}
return result;
}
}
}
public void saveExtractedEntitiesSet(File inputFile, File outputFile, LineParser parser, int restartFrom) throws Exception {
PrintWriter out = new PrintWriter(outputFile);
LOG.info("Opening input file "+inputFile.getAbsolutePath());
String text = readFileAsString(inputFile);
int i=0;
int correct =0 ;
int error = 0;
int sum = 0;
for (String snippet: text.split("\n")) {
String s = parser.parse(snippet);
if (s!= null && !s.equals("")) {
i++;
if (i<restartFrom) continue;
List<DBpediaResource> entities = new ArrayList<DBpediaResource>();
try {
final long startTime = System.nanoTime();
entities = extract(new Text(snippet.replaceAll("\\s+"," ")));
final long endTime = System.nanoTime();
sum += endTime - startTime;
LOG.info(String.format("(%s) Extraction ran in %s ns.", i, endTime - startTime));
correct++;
} catch (AnnotationException e) {
error++;
LOG.error(e);
e.printStackTrace();
}
for (DBpediaResource e: entities) {
out.println(e.uri());
}
out.println();
out.flush();
}
}
out.close();
LOG.info(String.format("Extracted entities from %s text items, with %s successes and %s errors.", i, correct, error));
LOG.info("Results saved to: "+outputFile.getAbsolutePath());
double avg = (new Double(sum) / i);
LOG.info(String.format("Average extraction time: %s ms", avg * 1000000));
}
public void evaluate(File inputFile, File outputFile) throws Exception {
evaluateManual(inputFile,outputFile,0);
}
public void evaluateManual(File inputFile, File outputFile, int restartFrom) throws Exception {
saveExtractedEntitiesSet(inputFile, outputFile, new LineParser.ManualDatasetLineParser(), restartFrom);
}
// public void evaluateCurcerzan(File inputFile, File outputFile) throws Exception {
// saveExtractedEntitiesSet(inputFile, outputFile, new LineParser.OccTSVLineParser());
// }
/**
* Entity extraction code.
* @param text
* @return
*/
public abstract List<DBpediaResource> extract(Text text) throws AnnotationException;
}