如题。jsoup获取数据出错。用了代理去访问也不行。
先贴爬取页面的代码
@Slf4j
@Component
public class SpiderUtil {
@Resource
private DynamicIpUtil dynamicIpUtil;
/**
* 根据url爬取页面信息
*
* @param url url
* @return 页面信息
*/
public Document spiderDocument(String url) {
Document pageDoc = null;
try {
Connection con= Jsoup.connect(url)
.userAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)")
.timeout(5000);
/*.ignoreHttpErrors(true)
.followRedirects(true)*/
Connection.Response resp = con.execute();
if (resp.statusCode() == 200){
pageDoc = con.get();
} else {
log.error("http status error");
dynamicIpUtil.changeMyIp();
spiderDocument(url);
}
if(pageDoc == null || pageDoc.toString().trim().equals("")) {// 表示ip被拦截或者其他情况
log.error("ip被拦截 无内容");
dynamicIpUtil.changeMyIp();
spiderDocument(url);
}
} catch (Exception e) {
log.error("ip被拦截 异常: {}", e);
dynamicIpUtil.getMyIpInfo();
dynamicIpUtil.changeMyIp();
spiderDocument(url);
}
if (ipDefensed(url, pageDoc)) {
// 如果被ip限制了,更换动态ip
dynamicIpUtil.changeMyIp();
spiderDocument(url);
}
return pageDoc;
}
/**
* 判断ip是否被封
*
* @param pageDoc 页面信息
* @return ip
*/
private boolean ipDefensed(String url, Document pageDoc) {
boolean ipDefensed = false;
if (url.contains("anjuke.com")) {
ipDefensed = AJKIpDefense(pageDoc);
}
return ipDefensed;
}
/**
* 安居客判断ip是否被封
*
* @param pageDoc 页面信息
*/
private boolean AJKIpDefense(Document pageDoc) {
log.error("ip 被拦截 安居客");
boolean ajkppDefensed = false;
String title = pageDoc.title();
if (title.equals("访问验证-安居客")) {
ajkppDefensed = true;
}
return ajkppDefensed;
}
}
再贴换动态ip的代码
@Slf4j
@Component
public class DynamicIpUtil {
private static List<String[]> ipAndPorts = new ArrayList<String[]>();
private static Integer ipPageNum = 1;
/**
* 更换动态ip
*/
public void changeMyIp() {
String [] ipAndPort = getDynamicIpAndPort();
String ip = ipAndPort[0];
String port = ipAndPort[1];
System.setProperty("http.maxRedirects", "50");
System.setProperty("https.maxRedirects", "50");
System.getProperties().setProperty("proxySet", "true");
System.getProperties().setProperty("http.proxyHost", ip);
System.getProperties().setProperty("http.proxyPort", port);
System.getProperties().setProperty("https.proxyHost", ip);
System.getProperties().setProperty("https.proxyPort", port);
}
/**
* 获取ip信息
*/
public void getMyIpInfo(){
try {
Document ipDoc = Jsoup.connect("http://www.ip.cn")
.userAgent("Mozilla")
.timeout(3000)
.get();
if(ipDoc != null){
String ipInfo = ipDoc.select(".well").first().text();
log.info("更换ip 成功: {}", ipInfo);
}
} catch (Exception e) {
log.info("暂不能获取ip 信息");
}
}
/**
* 获取动态ip
*
* @return 动态ip
*/
private String[] getDynamicIpAndPort() {
String[] ipAndPort = null;
if (ipAndPorts != null && ipAndPorts.size() > 0) {
ipAndPort = ipAndPorts.get(0);
ipAndPorts.remove(0);
} else {
try {
Document pageDoc = Jsoup.connect("http://www.xicidaili.com/wn/" + ipPageNum)
.userAgent("Mozilla")
.timeout(5000)
.get();
Elements elements = pageDoc.select("tr.odd");
ipPageNum ++;
if(ipPageNum > 400){
ipPageNum = 1;
}
for(Element element : elements){
String[] ipPort = new String[2];
String ip = element.child(1).text();
String port = element.child(2).text();
String noName = element.child(4).text();
// if(!noName.equals("高匿")){
// continue;
// }
String speedStr = element.child(6).select(".bar").first().attr("title");
double speed = Double.valueOf(speedStr.substring(0, speedStr.indexOf("秒")));
String timeStr = element.child(7).select(".bar").first().attr("title");
double time = Double.valueOf(timeStr.substring(0, timeStr.indexOf("秒")));
if(speed <= 1 && time <= 1){
ipPort[0] = ip;
ipPort[1] = port;
ipAndPorts.add(ipPort);
}
}
return getDynamicIpAndPort();
} catch (IOException e) {
log.error("get DynamicIpError error info :\n {}", e);
}
}
return ipAndPort;
}
}
如上。在获取这个网页上的数据的时候会出现问题 https://cd.zu.anjuke.com/fangyuan/p1/
具体的错误有几种。
java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.read(SocketInputStream.java:150)
at java.net.SocketInputStream.read(SocketInputStream.java:121)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:703)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:647)
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2000)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)
org.jsoup.HttpStatusException: HTTP error fetching URL
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:590)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:587)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)
java.io.IOException: Unable to tunnel through proxy. Proxy returns "HTTP/1.1 503 Too many open connections"
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2084)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)
java.net.SocketException: Unexpected end of file from server
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:790)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:647)
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2000)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:587)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)
请各位高手帮我看下。谢谢了。在线等。急急急急急急!!!!!
代理服务器是匿名代理么?如果不是,还是可以被追踪到ip的。
如果浏览器能打开的话,那就不是ip的问题。一般这个都是因为host,user-agent之类的头信息缺失。用firebug看一下头信息,大不了先全贴进去试一下