/* This code is part of Freenet. It is distributed under the GNU General
* Public License, version 2 (or at your option any later version). See
* http://www.gnu.org/ for further details of the GPL. */
package plugins.Freetalk.ui.NNTP;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import plugins.Freetalk.Board;
import plugins.Freetalk.Message;
import plugins.Freetalk.ui.NNTP.MIME.TransferEncoding;
import freenet.support.Logger;
/**
* Class for parsing input messages (received from the client by way
* of the POST command.)
*/
public class ArticleParser {
/**
 * Matches a single RFC 2047 encoded word of the form
 * =?charset?encoding?text?=. Group 1 is the charset name, group 2 the
 * encoding name and group 3 the (still encoded) text, which may not
 * contain '?' or a space.
 */
private static final Pattern encodedWordPattern = Pattern.compile("=\\?([^\\]\\[()<>@,;:\"/?.=]+)" // charset
+ "\\?([^\\]\\[()<>@,;:\"/?.=]+)" // encoding
+ "\\?([^? ]*)\\?="); // text
/** Author's nickname: the local part of the first address in the From header. Null until a message was parsed. */
private String authorName;
/** Domain part of the first address in the From header. Null until a message was parsed. */
private String authorDomain;
/** Message title, derived from the Subject header via Message.makeTitleValid(). */
private String title;
/** Message date parsed from the Date header; stays null when the message has no Date header. */
private Date date;
/** Names of the valid boards listed in the Newsgroups header. */
private ArrayList<String> boards;
/** Board to which replies should be sent: the first valid board of the Followup-To header, if any. */
private String replyToBoard;
/** Message-ID of the message being replied to (taken from In-Reply-To / References); null if there is none. */
private String parentID;
/** Body of the message after transfer/charset decoding and conversion of ">" quotes to [quote] tags. */
private String text;
/*
 * These booleans are used to avoid constructing log strings when the
 * corresponding log level is disabled (saves some CPU cycles).
 * NOTE(review): they are never assigned in this file; presumably
 * Logger.registerClass() below keeps them up to date - confirm against
 * the Logger implementation. logMINOR is not read anywhere in this class.
 */
private static transient volatile boolean logDEBUG = false;
private static transient volatile boolean logMINOR = false;
static {
Logger.registerClass(ArticleParser.class);
}
/**
 * Creates a parser with every parsed field unset (null). Call
 * {@link #parseMessage(ByteBuffer)} to fill the fields in.
 */
public ArticleParser() {
    // Header-derived strings and the body text.
    authorName = authorDomain = title = replyToBoard = parentID = text = null;
    // Non-string fields.
    boards = null;
    date = null;
}
/**
 * @return the local part of the author's mail address, or null if no
 *         message has been parsed yet
 */
public String getAuthorName() {
    return this.authorName;
}
/**
 * @return the domain part of the author's mail address, or null if no
 *         message has been parsed yet
 */
public String getAuthorDomain() {
    return this.authorDomain;
}
/**
 * @return the message title derived from the Subject header, or null if
 *         no message has been parsed yet
 */
public String getTitle() {
    return this.title;
}
/**
 * @return the board names taken from the Newsgroups header, or null if
 *         no message has been parsed yet. The returned list is the
 *         parser's own list, not a copy.
 */
public List<String> getBoards() {
    return this.boards;
}
/**
 * @return the board to which replies should go (from the Followup-To
 *         header), or null if none was given
 */
public String getReplyToBoard() {
    return this.replyToBoard;
}
/**
 * @return the Message-ID of the message this article replies to
 * @throws NoSuchFieldException if no parent message ID is known
 */
public String getParentID() throws NoSuchFieldException {
    if (parentID != null) {
        return parentID;
    }
    /* TODO: Also throw this in the other getter functions */
    throw new NoSuchFieldException();
}
/**
 * @return the decoded message body with ">" quotes converted to
 *         [quote] tags, or null if no message has been parsed yet
 */
public String getText() {
    return this.text;
}
/**
 * @return the date parsed from the Date header, or null if the message
 *         had no Date header or nothing has been parsed yet
 */
public Date getDate() {
    return this.date;
}
/**
 * A MIME content type: "type/subtype" plus the charset and boundary
 * parameters (the only parameters this class keeps).
 */
private static class ContentType {
    public String type;
    public String subtype;
    public String charset;
    public String boundary;

    public ContentType(String type, String subtype, String charset, String boundary) {
        this.type = type;
        this.subtype = subtype;
        this.charset = charset;
        this.boundary = boundary;
    }

    /**
     * Parse a Content-Type header.
     *
     * The charset defaults to "us-ascii" and the boundary to null when the
     * header does not specify them. Parameter parsing stops silently at the
     * first malformed parameter; whatever was parsed up to that point is
     * kept.
     *
     * @param hdr header value (without the "Content-Type:" prefix)
     * @return the parsed content type, or null if the value does not start
     *         with "type/subtype"
     */
    public static ContentType parseHeader(String hdr) {
        final HeaderTokenizer tokens = new HeaderTokenizer(hdr, true, true, "[]()<>@,;:\\/?=");

        // Mandatory part: word '/' word. A token code of 0 is treated as a
        // word whose text is available through getTokenText().
        if (tokens.getToken() != 0)
            return null;
        final String mainType = tokens.getTokenText();
        if (tokens.getToken() != '/' || tokens.getToken() != 0)
            return null;
        final String subType = tokens.getTokenText();

        final ContentType parsed = new ContentType(mainType, subType, "us-ascii", null);

        // Optional part: any number of "; name = value" parameters.
        while (tokens.getToken() == ';') {
            if (tokens.getToken() != 0)
                break;
            final String key = tokens.getTokenText();
            if (tokens.getToken() != '=' || tokens.getToken() != 0)
                break;
            final String val = tokens.getTokenText();

            if (key.equalsIgnoreCase("charset")) {
                parsed.charset = val;
            } else if (key.equalsIgnoreCase("boundary")) {
                parsed.boundary = val;
            }
        }
        return parsed;
    }
}
/**
 * A single mailbox (user at hostname), optionally with a display name.
 */
private static class Mailbox {
    public String name;
    public String local;
    public String domain;

    public Mailbox(String name, String local, String domain) {
        this.name = name;
        this.local = local;
        this.domain = domain;
    }

    /**
     * Parse a From header and extract the first address. (This does
     * not fully parse the header, so it will accept strings that are
     * not actually valid mailing addresses.)
     *
     * @param hdr header value (without the "From:" prefix)
     * @return the first mailbox found, or null if no address of the form
     *         local@domain (optionally inside angle brackets) could be
     *         extracted
     */
    public static Mailbox parseHeader(String hdr) {
        final HeaderTokenizer tokenizer = new HeaderTokenizer(hdr);
        String displayName = null;
        String localPart = null;
        // Text collected for the part (name, local or domain) being read.
        StringBuilder pending = new StringBuilder();
        boolean seenAt = false;
        boolean insideAngles = false;

        while (tokenizer.tokensRemaining()) {
            final int token = tokenizer.getToken();

            if (token == 0) {
                // A word: belongs to whatever part we are collecting.
                pending.append(tokenizer.getTokenText());
                continue;
            }
            if (token == '.') {
                pending.append('.');
                continue;
            }
            if (token == '@') {
                // A second '@' makes the address invalid.
                if (seenAt)
                    return null;
                localPart = pending.toString();
                seenAt = true;
                pending = new StringBuilder();
                continue;
            }
            if (token == '<') {
                // Everything collected before '<' is the display name.
                if (insideAngles || seenAt)
                    return null;
                displayName = pending.toString();
                insideAngles = true;
                pending = new StringBuilder();
                continue;
            }
            if (seenAt && token == '>') {
                // End of the bracketed address.
                insideAngles = false;
                break;
            }
            if (seenAt && (token == ',' || token == ';')) {
                // Separator after a complete address: only the first one is used.
                break;
            }

            // Any other token: forget everything collected so far and start over.
            displayName = null;
            localPart = null;
            seenAt = false;
            insideAngles = false;
            pending = new StringBuilder();
        }

        // Valid only if a domain was reached and any '<' has been closed.
        if (!seenAt || insideAngles)
            return null;
        return new Mailbox(displayName, localPart, pending.toString());
    }
}
/**
 * Parse a Newsgroups or Followup-To header and return the board names
 * it lists. Group names are converted with
 * FreetalkNNTPGroup.groupToBoardName(); names rejected by
 * Board.isNameValid() are skipped.
 *
 * @param hdr comma separated list of newsgroup names
 * @return the valid board names in header order (possibly empty)
 */
private static ArrayList<String> parseNewsgroups(String hdr) {
    final HeaderTokenizer tokenizer = new HeaderTokenizer(hdr, false, false, ",");
    final ArrayList<String> boardNames = new ArrayList<String>();

    while (tokenizer.tokensRemaining()) {
        // Only word tokens (code 0) carry a group name; separators are skipped.
        if (tokenizer.getToken() != 0)
            continue;

        final String candidate = FreetalkNNTPGroup.groupToBoardName(tokenizer.getTokenText());
        if (Board.isNameValid(candidate)) {
            boardNames.add(candidate);
        }
    }
    return boardNames;
}
/**
 * Parse a References header and return a list of message IDs.
 * A message ID is everything between a '<' and the next '>'; the angle
 * brackets themselves are not part of the returned strings. Text outside
 * angle brackets is ignored.
 *
 * @param hdr header value
 * @return the message IDs in header order (possibly empty)
 */
private static ArrayList<String> parseReferences(String hdr) {
    final HeaderTokenizer tokenizer = new HeaderTokenizer(hdr);
    final ArrayList<String> messageIDs = new ArrayList<String>();
    // Non-null only while we are between '<' and '>'.
    StringBuilder open = null;

    while (tokenizer.tokensRemaining()) {
        final int token = tokenizer.getToken();

        if (token == '<') {
            open = new StringBuilder();
            continue;
        }
        if (open == null) {
            // Outside of angle brackets nothing is collected.
            continue;
        }

        if (token == '>') {
            messageIDs.add(open.toString());
            open = null;
        } else if (token == 0) {
            open.append(tokenizer.getTokenText());
        } else if (token > 0) {
            // Any other positive token code is a literal special character.
            open.append((char) token);
        }
    }
    return messageIDs;
}
/**
 * Decode any encoded words in the header as per RFC 2047.
 *
 * Encoded words that cannot be decoded (unknown charset, unknown
 * encoding, malformed data) are copied to the output unchanged. As
 * required by RFC 2047 section 6.2, whitespace that merely separates two
 * adjacent, successfully decoded encoded words is dropped; all other text
 * is copied verbatim.
 *
 * @param str raw header text
 * @return the header text with all decodable encoded words replaced by
 *         their decoded form
 */
private static String decodeMIMEHeader(String str) {
    StringBuilder result = new StringBuilder();
    Matcher matcher = encodedWordPattern.matcher(str);
    int pos = 0;
    // True if the previous match was decoded successfully.
    boolean previousDecoded = false;

    while (matcher.find()) {
        // Plain text between the previous encoded word and this one.
        String gap = str.substring(pos, matcher.start());

        String decoded;
        try {
            Charset charset = Charset.forName(matcher.group(1));
            TransferEncoding encoding = TransferEncoding.headerWordEncoding(matcher.group(2));
            byte[] encodedBytes = matcher.group(3).getBytes("US-ASCII");
            ByteBuffer decodedBytes = encoding.decode(ByteBuffer.wrap(encodedBytes));
            decoded = charset.decode(decodedBytes).toString();
        }
        catch (Exception e) {
            // Deliberately broad: anything that goes wrong means "not a
            // valid encoded word", in which case the raw text is kept.
            decoded = null;
        }

        if (decoded == null) {
            result.append(gap);
            result.append(matcher.group());
        }
        else {
            // Whitespace between two encoded words only exists to separate
            // them and is not part of the decoded text.
            if (!(previousDecoded && isLinearWhitespace(gap)))
                result.append(gap);
            result.append(decoded);
        }

        previousDecoded = (decoded != null);
        pos = matcher.end();
    }

    result.append(str.substring(pos));
    return result.toString();
}

/**
 * Return true if the string consists only of spaces, tabs, carriage
 * returns and line feeds (this includes the empty string).
 */
private static boolean isLinearWhitespace(String s) {
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
            return false;
    }
    return true;
}
/**
 * Get the named header contents.
 *
 * The first line whose field name equals <code>name</code> (ignoring the
 * case of the line's characters) is used. Its value and the values of any
 * continuation lines (lines starting with a space or tab) are decoded
 * with decodeMIMEHeader() and joined with '\n'.
 *
 * @param headLines the header section, one element per line
 * @param name header name in lower case, without the colon
 * @return the header value, or null if there is no such header
 */
private static String getHeader(String[] headLines, String name) {
    final int nameLength = name.length();

    for (int index = 0; index < headLines.length; index++) {
        final String line = headLines[index];
        final int lineLength = line.length();

        // The line has to start with the header name.
        int cursor = 0;
        while (cursor < nameLength && cursor < lineLength
                && name.charAt(cursor) == Character.toLowerCase(line.charAt(cursor)))
            cursor++;
        if (cursor < nameLength)
            continue;

        // Optional blanks, then the mandatory colon.
        while (cursor < lineLength
                && (line.charAt(cursor) == ' ' || line.charAt(cursor) == '\t'))
            cursor++;
        if (cursor >= lineLength || line.charAt(cursor) != ':')
            continue;
        cursor++;

        // Skip initial whitespace of the value
        while (cursor < lineLength
                && (line.charAt(cursor) == ' ' || line.charAt(cursor) == '\t'))
            cursor++;

        final StringBuilder value = new StringBuilder(decodeMIMEHeader(line.substring(cursor)));

        // Append continuation lines
        for (int next = index + 1; next < headLines.length; next++) {
            final String continuation = headLines[next];
            if (continuation.length() == 0)
                break;
            final char first = continuation.charAt(0);
            if (first != ' ' && first != '\t')
                break;
            value.append('\n');
            value.append(decodeMIMEHeader(continuation));
        }
        return value.toString();
    }
    return null;
}
/**
 * Parse the message body and store the result in {@link #text}.
 *
 * The body is first run through the given content transfer encoding and
 * then decoded with the charset of the content type (UTF-8 if that
 * charset is not usable). If transfer decoding fails, the raw bytes are
 * decoded with the charset instead. Finally ">"-style quotes are
 * converted to [quote] tags.
 *
 * @param bytes raw body bytes
 * @param type content type of the body; only its charset is used
 * @param encodingName value of the Content-Transfer-Encoding header
 */
private void parseBody(ByteBuffer bytes, ContentType type, String encodingName) {
    // TODO: handle multi-part content, upload non-text parts as
    // attachments, etc.
    Charset bodyCharset;
    try {
        bodyCharset = Charset.forName(type.charset);
    }
    catch (IllegalArgumentException e) {
        Logger.error(this, "Illegal charset received", e);
        bodyCharset = Charset.forName("UTF-8");
    }

    String decodedText;
    try {
        TransferEncoding encoding = TransferEncoding.bodyEncoding(encodingName);
        // Decode from a duplicate so that a decoder which fails half-way
        // cannot move the position of the buffer used by the fallback.
        ByteBuffer decodedBytes = encoding.decode(bytes.duplicate());
        decodedText = bodyCharset.decode(decodedBytes).toString();
    }
    catch (Exception e) {
        // Best effort: treat the body as if it was not transfer-encoded.
        Logger.warning(this, "Unable to decode message body using transfer encoding: " + encodingName, e);
        decodedText = bodyCharset.decode(bytes).toString();
    }

    this.text = parseNNTPQuotesToBBCode(decodedText);
}
/**
 * Parse the >-styled quotes back to the [quote] syntax.
 *
 * Every line is trimmed and stripped of its leading '>' and space
 * characters; the number of '>' removed is the quote depth of the line.
 * Whenever the depth rises, one "[quote]" line per level is emitted, and
 * whenever it falls, one "[/quote]" line per level. Quotes which are still
 * open at the end of the body are closed, so the tags in the result are
 * always balanced. Lines that are empty after stripping are dropped.
 *
 * @param body the decoded message body
 * @return the body with quote markers replaced by [quote] tags, each
 *         output line terminated by '\n'
 * @todo Parse the author name eventually, to [quote author=…]
 * @todo Try to remove the "On X, Y wrote:" line that newsreaders add
 * when replying (hard to do, because there is no clear syntax, and it
 * can be written in different languages).
 */
private static String parseNNTPQuotesToBBCode(String body) {
    String[] lines = body.split("\n");
    StringBuilder result = new StringBuilder();
    // Quote depth of the previous line.
    int prev = 0;

    for (String l : lines) {
        int quoteDepth = 0;
        String rawLine = l.trim();
        // Strip the quote prefix, e.g. "> > " gives depth 2.
        while (rawLine.startsWith(">") || rawLine.startsWith(" ")) {
            if (rawLine.startsWith(">"))
                quoteDepth++;
            rawLine = rawLine.substring(1);
        }

        // Open or close as many levels as the depth changed by. At most
        // one of the two loops does anything.
        for (int i = prev; i < quoteDepth; ++i) {
            result.append("[quote]\n");
        }
        for (int i = quoteDepth; i < prev; ++i) {
            result.append("[/quote]\n");
        }

        if (!rawLine.equals("")) {
            result.append(rawLine).append("\n");
        }
        prev = quoteDepth;
    }

    // The body may end inside a quote: close whatever is still open.
    for (int i = 0; i < prev; ++i) {
        result.append("[/quote]\n");
    }
    return result.toString();
}
/**
 * Parse a complete message. The input is given as a byte buffer
 * since we do not know what encoding has been used until we parse
 * the header.
 *
 * On success the author, title, boards, date, reply-to board, parent ID
 * and text of this parser are set from the message. Parsing fails if
 * <ul>
 * <li>no blank line separating head and body is found, or nothing
 * follows it,</li>
 * <li>the Newsgroups, From or Subject header is missing,</li>
 * <li>the From header contains no usable address, or</li>
 * <li>a Date header is present but cannot be parsed.</li>
 * </ul>
 * Note that when the Date header is rejected, author, title and boards
 * have already been set.
 *
 * @param bytes the raw article; its position and limit are modified
 * @return true if the message was parsed successfully
 */
public boolean parseMessage(ByteBuffer bytes) {
    // split up message into head + body
    ByteBuffer headBytes, bodyBytes;
    boolean linestart = true;

    // Find the blank line separating head from body
    while (bytes.hasRemaining()) {
        byte b = bytes.get();
        // Treat CR LF like a bare LF
        if (bytes.hasRemaining() && b == '\r')
            b = bytes.get();

        if (b == '\n') {
            if (linestart)
                break;
            else
                linestart = true;
        }
        else
            linestart = false;
    }

    if (!bytes.hasRemaining()) {
        if(logDEBUG) Logger.debug(this, "Unable to find start of message body");
        return false;
    }

    // Save remaining (unread) bytes in bodyBytes...
    bodyBytes = bytes.slice();
    // and previous bytes in headBytes
    bytes.flip();
    headBytes = bytes.slice();

    // First try decoding headers as UTF-8.
    Charset utf8 = Charset.forName("UTF-8");
    String headUTF8 = utf8.decode(headBytes).toString();
    String[] headLines = headUTF8.split("\r?\n");

    // Read Newsgroups, Followup-To, and References headers
    String newsgroupsHeader = getHeader(headLines, "newsgroups");
    String followupToHeader = getHeader(headLines, "followup-to");
    String referencesHeader = getHeader(headLines, "references");
    String inReplyToHeader = getHeader(headLines, "in-reply-to");
    String transferEncodingHeader = getHeader(headLines, "content-transfer-encoding");
    String dateHeader = getHeader(headLines, "date");

    if (newsgroupsHeader == null) {
        if(logDEBUG) Logger.debug(this, "Unable to find Newsgroups header");
        return false;
    }

    // Read Content-Type header...
    String typeHeader = getHeader(headLines, "content-type");
    ContentType bodyType = null;
    if (typeHeader != null)
        bodyType = ContentType.parseHeader(typeHeader);
    if (bodyType == null)
        bodyType = new ContentType("text", "plain", "UTF-8", null);

    // ... and then try decoding the headers again using the body
    // charset.
    Charset bodyCharset;
    try {
        bodyCharset = Charset.forName(bodyType.charset);
    }
    catch (IllegalArgumentException e) {
        bodyCharset = Charset.forName("UTF-8");
    }
    headBytes.rewind();
    String head = bodyCharset.decode(headBytes).toString();
    headLines = head.split("\r?\n");

    // Read From and Subject headers using the body charset
    String fromHeader = getHeader(headLines, "from");
    String subjectHeader = getHeader(headLines, "subject");

    if (fromHeader == null) {
        if(logDEBUG) Logger.debug(this, "Unable to find From header");
        return false;
    }
    if (subjectHeader == null) {
        if(logDEBUG) Logger.debug(this, "Unable to find Subject header");
        return false;
    }

    // Try using the body charset for Newsgroups and Followup-To
    // if UTF-8 didn't work (U+FFFD is the decoder's replacement character).
    if (newsgroupsHeader.indexOf(0xfffd) != -1
            || (followupToHeader != null && followupToHeader.indexOf(0xfffd) != -1)) {
        newsgroupsHeader = getHeader(headLines, "newsgroups");
        followupToHeader = getHeader(headLines, "followup-to");
    }

    Mailbox addr = Mailbox.parseHeader(fromHeader);
    if (addr == null) {
        if(logDEBUG) Logger.debug(this, "Unable to parse From header");
        return false;
    }

    authorName = addr.local;
    authorDomain = addr.domain;
    title = Message.makeTitleValid(subjectHeader);
    boards = parseNewsgroups(newsgroupsHeader);

    //Parse the date if present in the message
    if(dateHeader != null) {
        //Strip the optional day of week. Do not assume anything about what
        //follows the comma: the header comes from the client and may end
        //right after it or use a different amount of whitespace.
        int comma = dateHeader.indexOf(',');
        if(comma != -1)
            dateHeader = dateHeader.substring(comma + 1);
        dateHeader = dateHeader.trim();

        try {
            date = new SimpleDateFormat("d MMM yyyy HH:mm:ss Z", java.util.Locale.US).parse(dateHeader);
        }
        catch (ParseException e) {
            Logger.warning(this, "Failed while parsing date: " + dateHeader, e);
            return false;
        }
    }

    if (followupToHeader != null) {
        ArrayList<String> followups = parseNewsgroups(followupToHeader);
        if (!followups.isEmpty())
            replyToBoard = followups.get(0);
    }

    // The parent is the last message ID of In-Reply-To. If that header is
    // missing or contains no message ID at all, use References instead.
    ArrayList<String> refs = new ArrayList<String>();
    if (inReplyToHeader != null)
        refs = parseReferences(inReplyToHeader);
    if (refs.isEmpty() && referencesHeader != null)
        refs = parseReferences(referencesHeader);
    if (!refs.isEmpty())
        parentID = refs.get(refs.size() - 1);

    // Bodies without a Content-Transfer-Encoding header are taken as 8bit.
    if (transferEncodingHeader != null)
        parseBody(bodyBytes, bodyType, transferEncodingHeader);
    else
        parseBody(bodyBytes, bodyType, "8bit");

    return true;
}
}