/**
* LinkExtractor
* Copyright 2011 by Michael Peter Christen
* First released 2.01.2011 at http://yacy.net
*
* $LastChangedDate: 2011-03-08 02:51:51 +0100 (Di, 08. Mär 2011) $
* $LastChangedRevision: 7567 $
* $LastChangedBy: low012 $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.protocol.http;
import java.net.MalformedURLException;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.WeakHashMap;
import java.util.regex.Pattern;

import net.yacy.cora.document.MultiProtocolURI;
/**
 * Extracts links from text by searching for the scheme prefixes
 * {@code smb://}, {@code ftp://}, {@code http://} and {@code https://}.
 * <p>
 * Links accumulate over repeated calls of {@link #scrape(String)} and are
 * handed out by {@link #getLinks()} in the order in which they were first found.
 */
public class LinkExtractor {

    private static final char lb = '<', rb = '>', dquotes = '"', space = ' ';

    /** Scheme prefixes that mark the beginning of a link. */
    private static final String[] SCHEMES = {"smb://", "ftp://", "http://", "https://"};

    // A LinkedHashSet holds strong references and iterates in insertion order.
    // A WeakHashMap must not be used here: nothing else references the URI keys,
    // so the garbage collector would be free to discard collected links at any time.
    private final Set<MultiProtocolURI> links;

    private final Pattern blackpattern;

    /**
     * Creates an extractor with an empty link collection.
     *
     * @param blackpattern links whose complete text matches this pattern are
     *        not collected; {@code null} disables the filter
     */
    public LinkExtractor(Pattern blackpattern) {
        this.links = new LinkedHashSet<MultiProtocolURI>();
        this.blackpattern = blackpattern;
    }

    /**
     * Searches the given text for links and adds them to the collection.
     * A link starts at one of the known scheme prefixes and ends before the
     * next whitespace, {@code <}, {@code >} or {@code "} character (or at the
     * end of the text). One trailing {@code '.'} is removed from the link.
     * Candidates that match the black pattern or that are rejected by the
     * {@code MultiProtocolURI} constructor are skipped silently.
     *
     * @param text the text to search; {@code null} is treated like an empty text
     */
    public void scrape(String text) {
        if (text == null) {
            return;
        }
        // '<', '>' and '"' delimit links in markup; turning them into spaces
        // lets a single whitespace scan find the end of a link
        text = text.replace(lb, space).replace(rb, space).replace(dquotes, space);

        // next[i] caches the first occurrence of SCHEMES[i] at or after the last
        // position it was searched from. It is only searched again once the scan
        // position has moved past it, which avoids re-scanning the whole remaining
        // text for every scheme on every iteration.
        final int[] next = new int[SCHEMES.length];
        for (int i = 0; i < SCHEMES.length; i++) {
            next[i] = find(text, SCHEMES[i], 0);
        }

        int s = 0;
        while (s < text.length()) {
            int p = Integer.MAX_VALUE;
            for (int i = 0; i < SCHEMES.length; i++) {
                if (next[i] < s) {
                    next[i] = find(text, SCHEMES[i], s);
                }
                if (next[i] < p) {
                    p = next[i];
                }
            }
            if (p == Integer.MAX_VALUE) {
                break; // no further scheme prefix in the text
            }
            // continue one character behind the start of this link, so a scheme
            // prefix that occurs inside this link is picked up as a link of its own
            s = p + 1;

            String u = text.substring(p, endOfToken(text, p));
            if (u.endsWith(".")) {
                // a trailing '.' is taken as sentence punctuation, not as part of the link
                u = u.substring(0, u.length() - 1);
            }
            if (this.blackpattern != null && this.blackpattern.matcher(u).matches()) {
                continue;
            }
            try {
                this.links.add(new MultiProtocolURI(u));
            } catch (final MalformedURLException ignored) {
                // best effort: a token that merely starts like a link is skipped
            }
        }
    }

    /**
     * Returns the links collected so far in the order in which they were first
     * found. Links that the set considers equal (per the {@code equals} and
     * {@code hashCode} of {@code MultiProtocolURI}) are contained only once.
     *
     * @return a newly allocated array of the collected links; empty if there are none
     */
    public MultiProtocolURI[] getLinks() {
        return this.links.toArray(new MultiProtocolURI[this.links.size()]);
    }

    /**
     * Like {@link String#indexOf(String, int)}, but reports a missing match as
     * {@link Integer#MAX_VALUE} so that results can be combined with a minimum.
     *
     * @param s the text to search in
     * @param m the string to search for
     * @param start the index to start the search from
     * @return the index of the first match at or after {@code start},
     *         or {@code Integer.MAX_VALUE} if there is none
     */
    private static int find(final String s, final String m, final int start) {
        final int p = s.indexOf(m, start);
        return (p < 0) ? Integer.MAX_VALUE : p;
    }

    /**
     * Finds the end of the whitespace-free token that begins at {@code start}.
     *
     * @param s the text to scan
     * @param start the index of the first character of the token
     * @return the index of the first whitespace character at or after
     *         {@code start}, or {@code s.length()} if there is none
     */
    private static int endOfToken(final String s, final int start) {
        int end = start;
        while (end < s.length() && !Character.isWhitespace(s.charAt(end))) {
            end++;
        }
        return end;
    }
}