* @return Language profile instance
* @throws LangDetectException if the training data cannot be loaded, e.g. when the file is not valid XML
*/
public static LangProfile loadFromWikipediaAbstract(String lang, File file) throws LangDetectException {
LangProfile profile = new LangProfile(lang);
BufferedReader br = null;
try {
InputStream is = new FileInputStream(file);
if (file.getName().endsWith(".gz")) is = new GZIPInputStream(is);
br = new BufferedReader(new InputStreamReader(is, "utf-8"));
TagExtractor tagextractor = new TagExtractor("abstract", 100);
XMLStreamReader reader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
reader = factory.createXMLStreamReader(br);
while (reader.hasNext()) {
switch (reader.next()) {
case XMLStreamReader.START_ELEMENT:
tagextractor.setTag(reader.getName().toString());
break;
case XMLStreamReader.CHARACTERS:
tagextractor.add(reader.getText());
break;
case XMLStreamReader.END_ELEMENT:
String text = tagextractor.closeTag();
if (text != null) profile.update(text);
break;
}
}
} catch (XMLStreamException e) {
throw new LangDetectException(ErrorCode.TrainDataFormatError, "Training database file '" + file.getName() + "' is an invalid XML.");