关于#java#的问题:private List

springboot项目报错
错误信息:
org.jsoup.HttpStatusException: HTTP error fetching URL. Status=404, URL=http://www.bkjy.sdnu.edu.cn/xszq1.htm
相关代码如下:

package com.rainng.studentinformationsystem.manager;
import com.rainng.studentinformationsystem.dao.redis.SdnuNewsDAO;
import com.rainng.studentinformationsystem.model.bo.SdnuNewsBO;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

@Component
public class SdnuNewsManager extends BaseManager {
    // Re-crawl once per hour; per-request HTTP timeout of 30 seconds.
    private static final int CRAWL_INTERVAL = 60 * 60 * 1000;
    private static final int CRAWL_TIMEOUT = 30 * 1000;
    private static final String CRAWL_TARGET_URL = "http://www.bkjy.sdnu.edu.cn/xszq1.htm";
    private static final String BASE_URL = "http://www.bkjy.sdnu.edu.cn/";
    // Stored value layout is "<date><url>" where the date is a fixed-width
    // 10-character prefix, e.g. "2019/01/01http://host/path".
    private static final int DATE_LENGTH = 10;

    private final SdnuNewsDAO sdnuNewsDAO;

    public SdnuNewsManager(SdnuNewsDAO sdnuNewsDAO) {
        this.sdnuNewsDAO = sdnuNewsDAO;
    }

    /**
     * Returns every cached news entry from the DAO.
     *
     * @return list of news BOs reconstructed from the DAO's
     *         title -&gt; "&lt;date&gt;&lt;url&gt;" map; empty if nothing is cached
     */
    public List<SdnuNewsBO> getAllNews() {
        Map<String, String> map = sdnuNewsDAO.getAllNews();

        List<SdnuNewsBO> newsList = new ArrayList<>(map.size());
        for (Map.Entry<String, String> entry : map.entrySet()) {
            String value = entry.getValue();
            // Split the fixed-width date prefix from the URL suffix.
            String date = value.substring(0, DATE_LENGTH);
            String url = value.substring(DATE_LENGTH);
            newsList.add(new SdnuNewsBO(entry.getKey(), date, url));
        }

        return newsList;
    }

    /**
     * Scheduled crawl: fetches the news page, parses it, and replaces the
     * cached entries. Leaves the existing cache untouched when the fetch
     * fails or the page yields no rows (e.g. the remote layout changed or
     * the URL now 404s, as in the reported error).
     */
    @Scheduled(fixedDelay = CRAWL_INTERVAL)
    public void crawlNews() {
        Document pageDoc = fetchPage();
        if (pageDoc == null) {
            return;
        }

        List<SdnuNewsBO> newsList = parseNews(pageDoc);
        if (newsList.isEmpty()) {
            // Don't wipe the cache on an empty parse — keep serving old data.
            return;
        }

        sdnuNewsDAO.clear();
        for (SdnuNewsBO news : newsList) {
            sdnuNewsDAO.addNews(news.getTitle(), news.getDate() + news.getUrl());
        }
    }

    /**
     * Fetches the target page.
     *
     * @return the parsed document, or {@code null} on any I/O failure
     *         (including non-2xx statuses such as the 404 reported here,
     *         which jsoup raises as {@code HttpStatusException})
     */
    private Document fetchPage() {
        try {
            return Jsoup.connect(CRAWL_TARGET_URL).timeout(CRAWL_TIMEOUT).get();
        } catch (IOException ex) {
            // Best-effort crawl: report and skip this cycle; the scheduler retries later.
            ex.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts news rows from the page. Rows are the &lt;tr&gt; elements with
     * an id starting with "line" inside the first table of the first
     * element with class "TB3". Returns an empty list (instead of throwing)
     * when the expected structure is missing or a row is malformed.
     */
    private List<SdnuNewsBO> parseNews(Document pageDoc) {
        List<SdnuNewsBO> newsList = new ArrayList<>();

        Element container = pageDoc.body().getElementsByClass("TB3").first();
        if (container == null) {
            return newsList;
        }
        Element table = container.getElementsByTag("table").first();
        if (table == null) {
            return newsList;
        }

        for (Element row : table.getElementsByTag("tr")) {
            if (!row.attr("id").startsWith("line")) {
                continue;
            }

            Elements links = row.getElementsByTag("a");
            Elements cells = row.getElementsByTag("td");
            if (links.isEmpty() || cells.size() < 3) {
                continue; // malformed row — skip rather than throw
            }

            Element aTag = links.get(0);
            // href is site-relative; prepend the base URL.
            String url = BASE_URL + aTag.attr("href");
            String title = aTag.text();
            String date = cells.get(2).text();
            newsList.add(new SdnuNewsBO(title, date, url));
        }

        return newsList;
    }
}

错误截图:

img


目前尚不清楚是哪里有问题。

错误提示是 jsoup 打开该网址时失败。我这边也无法访问这个链接,请先确认该网址本身是否还能在浏览器中正常打开。望采纳!

你这个页面返回就是 404状态码啊

img

这个错误信息表明爬虫在请求目标网站
http://www.bkjy.sdnu.edu.cn/xszq1.htm

时返回了404错误,即该网站上不存在该页面。可能是因为网站已经更改了结构或者网址,或者本身就不存在该页面。建议检查网站是否可以正常访问,并确认爬虫爬取的网址是否正确。