from bs4 import BeautifulSoup #网页解析,获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error #制定URL,获取网页数据
import xlwt #进行excel操作
import sqlite3 #进行SQLite数据库操作
# Entry point: builds the Top-250 listing base URL and starts the crawl.
def main():
baserl = 'https://movie.douban.com/top250?start='
# NOTE(review): indentation was lost in this paste. `fike` appears AFTER the
# getat() call here; getat() reads `fike` as a global, so in the original
# layout it presumably sat at module level — verify against the source post.
url1 = getat(baserl)
# Regex capturing the href of each movie item's detail-page anchor.
fike = re.compile(r'<a href="(.*?)">')
# Fetch all 10 listing pages (25 movies each) and print each item's link.
def getat(baserl):
for i in range(0,10):
# Douban pages by `start` offset: 0, 25, 50, ... 225.
url = baserl+str(25*i)
html = gat(url)
soup = BeautifulSoup(html,'html.parser')
# soup(...) is shorthand for soup.find_all(...).
for item in soup('div',class_='item'):
item = str(item)
save = []  # NOTE(review): unused — looks like leftover scaffolding
# Raises IndexError if the regex matches nothing in this item.
like = re.findall(fike,item)[0]
print(like)
# Download `url` with a browser User-Agent; returns the decoded body,
# or '' if any error occurred.
def gat(url):
# global html
head = {
# Pretend to be a desktop browser (Douban rejects urllib's default UA).
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
''}
a =urllib.request.Request(url,headers=head)
html = ''
try:
response = urllib.request.urlopen(a)
# BUG: 'docode' is a typo for 'decode' — raises AttributeError on bytes,
# which the bare except below silently swallows.
html = response.read().docode('utf-8')
except :
# Bare except hides the real error; '14' is just an opaque debug marker.
print('14')
return html
# Script entry point: crawl, then report completion.
if __name__ == '__main__':
main()
print('爬完')
分析过程:
使用以下代码,可以追踪错误信息。
from bs4 import BeautifulSoup #网页解析,获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error #制定URL,获取网页数据
import xlwt #进行excel操作
import sqlite3 #进行SQLite数据库操作
# Entry point (debug version — identical to the original copy above).
def main():
baserl = 'https://movie.douban.com/top250?start='
url1 = getat(baserl)
# Regex capturing the href of each movie item's detail-page anchor;
# presumably module-level in the original layout (indentation lost in paste).
fike = re.compile(r'<a href="(.*?)">')
# Fetch all 10 listing pages and print each item's link (debug version).
def getat(baserl):
for i in range(0,10):
url = baserl+str(25*i)
html = gat(url)
soup = BeautifulSoup(html,'html.parser')
# soup(...) is shorthand for soup.find_all(...).
for item in soup('div',class_='item'):
item = str(item)
save = []  # NOTE(review): unused — leftover scaffolding
like = re.findall(fike,item)[0]
print(like)
# Download `url`; debug version — same code, but the bare except has been
# replaced so the real error message gets printed.
def gat(url):
# global html
head = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
''}
a =urllib.request.Request(url,headers=head)
html = ''
try:
response = urllib.request.urlopen(a)
# Still contains the 'docode' typo for 'decode'...
html = response.read().docode('utf-8')
except Exception as e:
# ...which this now surfaces: "'bytes' object has no attribute 'docode'".
print(e)
return html
# Script entry point: crawl, then report completion.
if __name__ == '__main__':
main()
print('爬完')
输出:
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
'bytes' object has no attribute 'docode'
爬完
一看就知道是打错了。
改正后代码:
from bs4 import BeautifulSoup #网页解析,获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error #制定URL,获取网页数据
import xlwt #进行excel操作
import sqlite3 #进行SQLite数据库操作
def main():
    """Crawl the Douban Top 250 listing pages and print each movie link.

    The return value of getat() is bound but unused, matching the original.
    """
    baserl = 'https://movie.douban.com/top250?start='
    url1 = getat(baserl)  # getat() prints the links as it goes


# Compiled once at module level so getat() can read it as a global.
# NOTE(review): the paste lost indentation; placing this at module level is
# the only layout under which getat()'s reference to `fike` works, consistent
# with the author's claim that this version runs. It is bound before main()
# executes because the module body runs top-to-bottom before the guard below.
fike = re.compile(r'<a href="(.*?)">')
def getat(baserl):
    """Fetch all 10 Top-250 listing pages and print each item's link.

    Args:
        baserl: base listing URL ending in 'start=' (page offset appended).

    Returns:
        The list of extracted links. (The original returned None, which the
        caller ignored, so returning the list is backward-compatible.)
    """
    links = []
    for page in range(10):
        # Douban pages by `start` offset: 0, 25, 50, ... 225.
        url = baserl + str(25 * page)
        html = gat(url)
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            matches = re.findall(fike, str(item))
            if not matches:
                # Robustness: the original indexed [0] unconditionally and
                # would crash with IndexError on an item with no anchor.
                continue
            like = matches[0]
            print(like)
            links.append(like)
    return links
def gat(url):
    """Download `url` and return its body decoded as UTF-8.

    Args:
        url: the page URL to fetch.

    Returns:
        The decoded page text, or '' if any network/decode error occurred
        (best-effort: the error is printed and swallowed, as in the original).
    """
    head = {
        # Pretend to be a desktop browser; Douban rejects urllib's default
        # User-Agent. (The original appended a no-op '' via implicit string
        # concatenation — removed, value is identical.)
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
    }
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        # Context manager closes the connection even if read()/decode()
        # raises — the original leaked the response object.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
    return html
# Script entry point: run the crawler, then report completion.
if __name__ == '__main__':
    main()
    print('爬完')
本人实测可正常执行。
（报错信息已在上文“输出”部分贴出。）