DOM 解析方式:按标签逐层解析,即可取出其中的内容。
可以用 Dom4J 试试?
import cn.hutool.crypto.SecureUtil;
import cn.hutool.crypto.digest.MD5;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import javax.annotation.Nullable;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.LocalTime;
import java.util.*;
/**
 * Demo that parses a small, well-formed HTML snippet as XML with dom4j and
 * prints the trimmed text of every table cell ({@code <th>} and {@code <td>}),
 * row by row.
 *
 * @author zhaojinhui
 * @date 2021/6/4 15:11
 */
public class ElevenTest {

    /** Sample markup: one header row and two data rows, two columns each. */
    private static final String SAMPLE_HTML = "<!DOCTYPE html>\n" +
            "<html lang=\"en\">\n" +
            "<head>\n" +
            " <meta charset=\"UTF-8\" />\n" +
            " <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\" />\n" +
            " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n" +
            " <title>Document</title>\n" +
            "</head>\n" +
            "<body>\n" +
            " <table>\n" +
            " <tr>\n" +
            " <th>测试表头1</th>\n" +
            " <th>测试表头2</th>\n" +
            " </tr>\n" +
            " <tr>\n" +
            " <td>测试列1</td>\n" +
            " <td>测试列2</td>\n" +
            " </tr>\n" +
            " <tr>\n" +
            " <td>测试列3</td>\n" +
            " <td>测试列4</td>\n" +
            " </tr>\n" +
            " </table>\n" +
            "</body>\n" +
            "</html>";

    /**
     * Parses {@link #SAMPLE_HTML} and prints each cell's text on its own line,
     * in document order.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Encode explicitly as UTF-8: the markup carries no XML encoding
        // declaration, so the parser reads the bytes as UTF-8. Relying on the
        // platform default charset would corrupt the Chinese text elsewhere.
        ByteArrayInputStream input =
                new ByteArrayInputStream(SAMPLE_HTML.getBytes(StandardCharsets.UTF_8));
        // NOTE(review): the reader uses its default configuration, which is
        // acceptable only because the input is the literal above. Disable
        // DTD/external-entity processing before parsing untrusted XML.
        SAXReader saxReader = new SAXReader();
        try {
            Document document = saxReader.read(input);
            Element bodyTag = document.getRootElement().element("body");
            Element tableTag = bodyTag == null ? null : bodyTag.element("table");
            if (tableTag == null) {
                System.err.println("No <body>/<table> element found in the document");
                return;
            }
            List<Element> rows = tableTag.elements("tr");
            for (Element row : rows) {
                for (String text : cellTexts(row)) {
                    System.out.println(text);
                }
            }
        } catch (DocumentException e) {
            // No logger is configured in this demo; report the parse failure on stderr.
            e.printStackTrace();
        }
    }

    /**
     * Collects the trimmed text of every header or data cell in a table row.
     *
     * @param row a {@code <tr>} element
     * @return the cell texts in document order; empty when the row has no
     *         {@code <th>} or {@code <td>} children
     */
    private static List<String> cellTexts(Element row) {
        List<String> texts = new ArrayList<>();
        // Walk ALL children: element("td") would return only the first match
        // and silently drop every column after the first.
        List<Element> cells = row.elements();
        for (Element cell : cells) {
            String tag = cell.getName();
            if ("th".equals(tag) || "td".equals(tag)) {
                texts.add(cell.getTextTrim());
            }
        }
        return texts;
    }
}