Java 爬虫 WebMagic:如何用正则表达式匹配 a 标签 onclick 属性中 queryArticleByCondition(this, '...') 的第二个参数(即 this 后面的值,它是一个 URL 地址)?示例标签如下:
<a style="cursor:pointer" onclick="queryArticleByCondition(this,'/liuyanggov/dwzt/ggzyjyzx/jyxx96/fjsz34/zbgg97/2a7bc3f8-3.html')" tagname="/liuyanggov/dwzt/ggzyjyzx/jyxx96/fjsz34/zbgg97/2a7bc3f8-3.html">下一页</a>
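可以自定义一个 Selector 类来筛选 a 标签,参考下面的 demo。注意:该 demo 取的是 href / abs:href 属性;如果要取 onclick 中的 URL,可把取值处改为读取 element.attr("onclick"),再用正则 queryArticleByCondition\(this,\s*'([^']+)'\) 提取第一个捕获组: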
/**
 * Element selector that collects the {@code href} value of every element matched by the
 * query built from {@code LinkTag.HREF} and {@code LinkTag.NOTNA}.
 *
 * <p>Only {@link #selectList(Element)} is implemented. {@link #select(Element)},
 * {@link #selectElement(Element)} and {@link #selectElements(Element)} always throw
 * {@link UnsupportedOperationException}.
 */
public class LinksSelector extends BaseElementSelector {

    public LinksSelector() {
    }

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public String select(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns one link string per element matched under {@code element}.
     *
     * @param element the root element the query is run against
     * @return a new list holding, for each matched element, its {@code abs:href} attribute when
     *     the element's base URI is non-blank, otherwise its plain {@code href} attribute
     */
    @Override
    public List<String> selectList(Element element) {
        // NOTE(review): LinkTag is declared outside this file; HREF and NOTNA are presumably
        // CSS-query fragments (e.g. "a[href]" plus an exclusion filter) - confirm against LinkTag.
        Elements elements = element.select(LinkTag.HREF.toString() + LinkTag.NOTNA.toString());
        List<String> links = new ArrayList<String>(elements.size());
        for (Element link : elements) {
            if (!StringUtil.isBlank(link.baseUri())) {
                // Assuming Element is org.jsoup.nodes.Element, the "abs:" prefix asks jsoup
                // for the attribute resolved against the base URI - TODO confirm.
                links.add(link.attr("abs:href"));
            } else {
                links.add(link.attr("href"));
            }
        }
        return links;
    }

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Element selectElement(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this selector.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public List<Element> selectElements(Element element) {
        throw new UnsupportedOperationException();
    }

    /**
     * Always reports {@code true}. How callers use this flag is defined by
     * {@code BaseElementSelector}, which is not visible here.
     *
     * @return {@code true}
     */
    @Override
    public boolean hasAttribute() {
        return true;
    }
}