Java解析Html获取页面中所有列的值

 

可以采用 DOM 解析方式:按标签逐层解析文档,再取出各标签中的文本内容即可。

可以试试 Dom4J。注意:Dom4J 是 XML 解析器,要求 HTML 必须是格式良好的 XML(标签全部闭合、属性值加引号)。示例如下:

import cn.hutool.crypto.SecureUtil;
import cn.hutool.crypto.digest.MD5;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import javax.annotation.Nullable;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.LocalTime;
import java.util.*;

/**
 * Demo that parses a well-formed HTML document with dom4j and prints the trimmed
 * text of every table cell, covering both {@code <th>} header cells and
 * {@code <td>} data cells, in document order.
 *
 * <p>dom4j's {@link SAXReader} is an XML parser, so the markup fed to it must be
 * well-formed XML (every tag closed, attribute values quoted).
 *
 * @author zhaojinhui
 * @date 2021/6/4 15:11
 */
public class ElevenTest {
    /**
     * Parses the embedded sample page and prints one line per table cell.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str = "<!DOCTYPE html>\n" +
                "<html lang=\"en\">\n" +
                "<head>\n" +
                "    <meta charset=\"UTF-8\" />\n" +
                "    <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\" />\n" +
                "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n" +
                "    <title>Document</title>\n" +
                "</head>\n" +
                "<body>\n" +
                "    <table>\n" +
                "        <tr>\n" +
                "            <th>测试表头1</th>\n" +
                "            <th>测试表头2</th>\n" +
                "        </tr>\n" +
                "        <tr>\n" +
                "            <td>测试列1</td>\n" +
                "            <td>测试列2</td>\n" +
                "        </tr>\n" +
                "        <tr>\n" +
                "            <td>测试列3</td>\n" +
                "            <td>测试列4</td>\n" +
                "        </tr>\n" +
                "    </table>\n" +
                "</body>\n" +
                "</html>";
        // Encode explicitly as UTF-8: the document carries no XML encoding declaration,
        // so the parser reads the bytes as UTF-8. Relying on the platform default
        // charset would garble the Chinese text on non-UTF-8 systems.
        ByteArrayInputStream bais = new ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8));
        SAXReader saxReader = new SAXReader();
        try {
            Document read = saxReader.read(bais);
            Element htmlTag = read.getRootElement();
            Element bodyTag = htmlTag.element("body");
            // element(...) returns null when the child is absent; guard instead of hitting an NPE.
            Element tableTag = bodyTag == null ? null : bodyTag.element("table");
            if (tableTag == null) {
                System.err.println("No <table> element found under <html>/<body>");
                return;
            }
            // Only <tr> children are rows; skip anything else that may sit inside the table.
            List<Element> trTags = tableTag.elements("tr");
            for (Element trTag : trTags) {
                // Walk ALL cells of the row (element("td") would return just the first one).
                List<Element> cells = trTag.elements();
                for (Element cell : cells) {
                    String cellName = cell.getName();
                    // Header cells and data cells are both treated as column values.
                    if ("th".equals(cellName) || "td".equals(cellName)) {
                        System.out.println(cell.getTextTrim());
                    }
                }
            }
        } catch (DocumentException e) {
            e.printStackTrace();
        }
    }

}