springboot项目报错
错误信息:
org.jsoup.HttpStatusException: HTTP error fetching URL. Status=404, URL=http://www.bkjy.sdnu.edu.cn/xszq1.htm
相关代码如下:
package com.rainng.studentinformationsystem.manager;
import com.rainng.studentinformationsystem.dao.redis.SdnuNewsDAO;
import com.rainng.studentinformationsystem.model.bo.SdnuNewsBO;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@Component
public class SdnuNewsManager extends BaseManager {
    /** Delay between scheduled crawls: one hour, in milliseconds. */
    private static final int CRAWL_INTERVAL = 60 * 60 * 1000;
    /** Connect/read timeout for the crawl request: 30 seconds, in milliseconds. */
    private static final int CRAWL_TIMEOUT = 30 * 1000;
    // NOTE(review): this URL currently returns HTTP 404 — the site has likely moved or
    // renamed the page. Verify the current news-list URL in a browser and update both
    // constants below before relying on the crawler.
    private static final String CRAWL_TARGET_URL = "http://www.bkjy.sdnu.edu.cn/xszq1.htm";
    private static final String BASE_URL = "http://www.bkjy.sdnu.edu.cn/";
    /** Length of the "yyyy/MM/dd" date prefix stored in front of the URL in Redis values. */
    private static final int DATE_PREFIX_LENGTH = 10;
    // Browser-like UA string: some servers reject (403/404) Java's default user agent.
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                    + "Chrome/103.0.0.0 Safari/537.36";

    private final SdnuNewsDAO sdnuNewsDAO;

    public SdnuNewsManager(SdnuNewsDAO sdnuNewsDAO) {
        this.sdnuNewsDAO = sdnuNewsDAO;
    }

    /**
     * Reads all cached news entries from Redis and converts them into BOs.
     * <p>
     * Each Redis value is stored as a 10-character date followed immediately by the URL,
     * e.g. {@code "2019/01/01http://host/path"}. Malformed values (shorter than the date
     * prefix) are skipped instead of throwing.
     *
     * @return list of news items; empty if the cache is empty
     */
    public List<SdnuNewsBO> getAllNews() {
        Map<String, String> map = sdnuNewsDAO.getAllNews();
        List<SdnuNewsBO> newsList = new ArrayList<>(map.size());
        for (Map.Entry<String, String> entry : map.entrySet()) {
            String value = entry.getValue();
            if (value == null || value.length() < DATE_PREFIX_LENGTH) {
                // Defensive: a truncated value would otherwise throw StringIndexOutOfBounds.
                continue;
            }
            String date = value.substring(0, DATE_PREFIX_LENGTH);
            String url = value.substring(DATE_PREFIX_LENGTH);
            newsList.add(new SdnuNewsBO(entry.getKey(), date, url));
        }
        return newsList;
    }

    /**
     * Scheduled job: fetches the news page, parses it, and replaces the Redis cache.
     * The cache is only cleared after a successful fetch and parse, so a transient
     * network failure keeps the previous data intact.
     */
    @Scheduled(fixedDelay = CRAWL_INTERVAL)
    public void crawlNews() {
        Document pageDoc = fetchPage();
        if (pageDoc == null) {
            return;
        }
        List<SdnuNewsBO> newsList = parseNews(pageDoc);
        if (newsList.isEmpty()) {
            // Page layout probably changed; keep the old cache rather than wiping it.
            return;
        }
        sdnuNewsDAO.clear();
        for (SdnuNewsBO news : newsList) {
            sdnuNewsDAO.addNews(news.getTitle(), news.getDate() + news.getUrl());
        }
    }

    /**
     * Downloads the target page with a browser user agent.
     *
     * @return the parsed document, or {@code null} on any I/O or HTTP error
     *         (including the 404 currently returned by the target URL)
     */
    private Document fetchPage() {
        try {
            // Jsoup.connect gives control over UA and timeout; Jsoup.parse(URL, timeout)
            // sends Java's default agent, which some servers refuse.
            return Jsoup.connect(CRAWL_TARGET_URL)
                    .userAgent(USER_AGENT)
                    .timeout(CRAWL_TIMEOUT)
                    .get();
        } catch (IOException ex) {
            // TODO: replace with a proper logger (SLF4J) instead of stderr.
            ex.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts news rows from the page.
     * <p>
     * Expected structure: a {@code class="TB3"} element containing a table whose rows
     * of interest have an {@code id} starting with {@code "line"}; each such row holds
     * an {@code <a>} (title + relative href) and at least three {@code <td>} cells,
     * the third being the date. Rows or pages not matching this shape are skipped
     * instead of throwing {@link IndexOutOfBoundsException}.
     *
     * @param pageDoc the fetched page, never {@code null}
     * @return parsed news items; empty if the expected structure is absent
     */
    private List<SdnuNewsBO> parseNews(Document pageDoc) {
        List<SdnuNewsBO> newsList = new ArrayList<>();
        Elements tb3 = pageDoc.body().getElementsByClass("TB3");
        if (tb3.isEmpty()) {
            return newsList;
        }
        Elements tables = tb3.get(0).getElementsByTag("table");
        if (tables.isEmpty()) {
            return newsList;
        }
        for (Element row : tables.get(0).getElementsByTag("tr")) {
            if (!row.attr("id").startsWith("line")) {
                continue;
            }
            Elements links = row.getElementsByTag("a");
            Elements cells = row.getElementsByTag("td");
            if (links.isEmpty() || cells.size() < 3) {
                continue; // unexpected row shape — skip rather than crash the whole crawl
            }
            Element aTag = links.get(0);
            // Assumes href is relative to the site root — TODO confirm once the page
            // is reachable again; absUrl("href") would be safer if hrefs can be absolute.
            String url = BASE_URL + aTag.attr("href");
            newsList.add(new SdnuNewsBO(aTag.text(), cells.get(2).text(), url));
        }
        return newsList;
    }
}
错误截图:
jsoup 抓取该网址时收到了 404 状态码,说明服务器上已不存在这个页面(我这边在浏览器里也打不开)。请先确认该网址当前是否还有效。
另外补充:这不是 jsoup 的问题,服务器对这个 URL 的响应本身就是 404。
这个错误信息表明爬虫在请求目标页面
http://www.bkjy.sdnu.edu.cn/xszq1.htm
时收到了 404 响应,即该页面在网站上已不存在。可能的原因是网站改版、页面路径变更,或该页面本就被删除了。建议先在浏览器中确认网址能否正常访问,再检查爬虫配置的 URL 是否需要更新;同时可以给请求加上浏览器 User-Agent,排除服务器屏蔽默认 Java 客户端的可能。