# 前端环境是基于Node.js平台,使用HTML、JS、CSS等技术,通过Httpd服务器和HTTP协议实现与用户的交互。开发过程中使用了Vscode编辑器来编写代码。
# 后端环境是基于Java EE平台,使用Java语言和相关技术(如Maven、Tomcat、SSM框架、MySQL数据库、Spring和mybatis等)来实现服务器端的业务逻辑和数据处理。开发过程中在Windows系统下使用Java环境完成编写。
import urllib.request
import urllib.parse
import json
import jsonpath
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
url_front = 'http://product.dangdang.com/index.php?r=comment%2Flist&productId=1204926048&categoryPath=01.41.26.21.00.00&mainProductId=1204926048&mediumId=0&pageIndex='
url_back = '&sortType=1&filterType=1&isSystem=1&tagId=0&tagFilterCount=0&template=publish&long_or_short=short'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
}
def handle_request(url):
request = urllib.request.Request(url=url, headers=headers)
return request
def get_response(request):
response = urllib.request.urlopen(request)
return response
def parse_json(json_text):
    """Extract the comment-list HTML fragment from the JSON response.

    Args:
        json_text: Raw JSON string returned by the comment-list endpoint.

    Returns:
        The HTML string stored at ``data.list.html`` in the response.

    Raises:
        KeyError: If the response lacks the expected data/list/html structure.
        json.JSONDecodeError: If *json_text* is not valid JSON.
    """
    obj = json.loads(json_text)
    # Equivalent to the JSONPath '$.data.list.html' the original used via the
    # third-party jsonpath package, but plain dict access needs no extra
    # dependency and raises a clear KeyError on a missing key instead of the
    # TypeError that jsonpath's `False` no-match return value produced.
    return obj['data']['list']['html']
def main():
    """Scrape Dangdang book comments for a user-chosen range of pages.

    Prompts for a start and end page number, fetches every page in that
    inclusive range, extracts the comment fields with a regex, and prints
    each match tuple.
    """
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    # Capture groups, in order: rating <em>, reviewer profile URL, reviewer
    # name, date <span>, comment id, book title (img alt), cover image URL.
    # Compiled once here so the loop below reuses the same pattern object.
    pattern = re.compile(r'''<div class="comment_items clearfix">
.*?
<em>(.*?)</em>
.*?
<span><a href="(.*?)" target="_blank">(.*?)</a></span>
.*?
<span>(.*?)</span>
.*?
<div class="support" data-comment-id="(.*?)">
.*?
<a class="pic" href="javascript:"><img alt="(.*?)" src="(.*?)"/></a>
.*?''', re.S)
    # BUG FIX: the original read end_page but only ever fetched start_page;
    # iterate the full inclusive page range instead.
    for page in range(start_page, end_page + 1):
        url = url_front + str(page) + url_back
        # The endpoint serves GBK-encoded JSON, so decode explicitly.
        json_text = get_response(handle_request(url=url)).read().decode('gbk')
        html = parse_json(json_text)
        # Name the parser explicitly so BeautifulSoup does not emit its
        # "no parser specified" guess warning.
        soup = BeautifulSoup(html, 'html.parser')
        comment_divs = soup.select('.item_wrap > div')
        # str() of the tag list, minus the surrounding brackets, recreates
        # the flat markup blob the regex above was written against.
        blob = str(comment_divs)[1:-1]
        for comment in pattern.findall(blob):
            print(comment)
if __name__ == '__main__':
main()