不清楚代码的问题具体出在哪里,也不知道该如何解决。
这些代码是为了完成一个项目:爬取豆瓣电影区的内容。主体部分已经写完,并使用 cookie 池和 User-Agent 池来应对反爬。但运行时会报错,不清楚最终是哪里出的问题——可能是 url,也可能是 cookie——希望能指出原因和解决方法。
第一个py文件get_douban
import time
import re
import datetime
import random
import requests
from lxml import etree
import pymysql
# 导入UA池 COOKIE池
from settings import *
# cookie更新
from up_cookies import get_cookies_api
class Getmovie:
    """Crawl Douban's movie search listing and store one row per movie in MySQL.

    Listing pages are fetched as JSON and every movie's detail page as HTML;
    the parsed record is inserted into the ``douban_data`` table. Each request
    is sent with a randomly chosen cookie and User-Agent.

    ``HOST``, ``USER``, ``PASSWORD``, ``PORT``, ``CHARTSET``, ``DB``,
    ``COOKIE_POOL``, ``USER_AGENT_POOl`` and ``headers`` are module-level names
    supplied by the star-import of ``settings``; ``get_cookies_api`` comes from
    ``up_cookies``.
    """

    # Seconds to wait for a response before the request counts as failed.
    REQUEST_TIMEOUT = 15
    # Re-fetches allowed per movie before its URL is written to error_url.txt.
    MAX_DETAIL_RETRIES = 10
    # Number of missing fields at which a detail page is treated as a bad response.
    MAX_NULL_FIELDS = 3

    # Patterns for the plain-text rows of the detail page's info box. They are
    # compiled once here instead of on every parse_detail call.
    _LANGUAGES_RE = re.compile('<span class="pl">语言:</span> (?P<languages>.*?)<br/>', re.S)
    _REGION_RE = re.compile('<span class="pl">制片国家/地区:</span>(?P<Pro_region>.*?)<br/>', re.S)
    _IMDB_RE = re.compile('<span class="pl">IMDb:</span>(?P<IMDb>.*?)<br>', re.S)
    _ALIAS_RE = re.compile('<span class="pl">又名:</span>(?P<alias>.*?)<br/>', re.S)

    def __init__(self, begin, over):
        """Open the log files and the database connection.

        Args:
            begin: First listing page to crawl (0-based).
            over: Page at which to stop (exclusive, as in ``range``).
        """
        self.begin_number = begin
        self.end_number = over
        self.req_count = 0  # re-fetch counter for the movie currently being parsed
        # Work on a copy: exhausted cookies are removed from this list, and the
        # list defined in settings must not be emptied as a side effect.
        self.COOKIE_POOL = list(COOKIE_POOL)
        self.f1 = None
        self.conn = None
        self.cursor = None
        # Error log (truncated on every run) and list of URLs that were given up on.
        self.f = open(r"./error_log.txt", 'w', encoding="utf-8")
        try:
            self.f1 = open(r"./error_url.txt", 'a', encoding="utf-8")
            self.conn = pymysql.connect(
                host=HOST,
                user=USER,
                password=PASSWORD,
                port=PORT,
                charset=CHARTSET,
                db=DB
            )
            self.cursor = self.conn.cursor()
        except Exception:
            # Do not leak the handles that were already opened.
            self.close()
            raise

    def close(self):
        """Release the cursor, the connection and both log files (best effort)."""
        resources = (
            getattr(self, "cursor", None),
            getattr(self, "conn", None),
            getattr(self, "f", None),
            getattr(self, "f1", None),
        )
        for resource in resources:
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as error:
                # Cleanup must not hide the exception that brought us here.
                print(f"close failed: {error}")

    @staticmethod
    def _first_present(mapping, *keys):
        """Return ``mapping[key]`` for the first of ``keys`` that is present.

        Raises:
            KeyError: if ``mapping`` is not a dict or contains none of the
                keys; the message lists what was actually received.
        """
        if isinstance(mapping, dict):
            for key in keys:
                if key in mapping:
                    return mapping[key]
            received = list(mapping)
        else:
            received = type(mapping).__name__
        raise KeyError(f"none of {keys} present, got: {received}")

    @staticmethod
    def _build_headers(cookie):
        """Return a per-request copy of ``headers`` with a random UA and ``cookie``."""
        # Copy instead of mutating the shared settings dict in place.
        request_headers = dict(headers)
        request_headers['User-Agent'] = random.choice(USER_AGENT_POOl)
        request_headers['Cookie'] = cookie
        return request_headers

    def _take_cookie(self, page):
        """Return a random cookie, refilling the pool first when it is empty.

        Raises:
            RuntimeError: if the pool is still empty after the refill.
        """
        self.up_cookies(page)
        if not self.COOKIE_POOL:
            raise RuntimeError("COOKIE_POOL 重新加载后仍为空,无法继续请求")
        return random.choice(self.COOKIE_POOL)

    def _retire_cookie(self, cookie, error_info):
        """Drop ``cookie`` from the pool, log ``error_info`` and pause 5 seconds."""
        if cookie in self.COOKIE_POOL:
            self.COOKIE_POOL.remove(cookie)
        print(error_info)
        self.f.write(error_info + '\n')
        self.f.write("\n")  # blank line between log entries
        time.sleep(5)  # back off before the next attempt

    def get_page_source(self, url, page):
        """Fetch one listing page and return its list of movie dicts.

        A failed request, a non-JSON body or a JSON body without the movie
        list retires the cookie that was used and retries with another one.

        Args:
            url: Listing URL.
            page: Page index, forwarded to ``up_cookies``.

        Raises:
            RuntimeError: if no cookie is available even after a refill.
        """
        while True:
            # Picked inside the loop so an exhausted pool is refilled instead
            # of making random.choice fail on an empty list.
            tmp_cookie = self._take_cookie(page)
            request_headers = self._build_headers(tmp_cookie)
            try:
                response = requests.get(url, headers=request_headers, timeout=self.REQUEST_TIMEOUT)
                # NOTE(review): this endpoint has been seen returning the list
                # under "data"; "douban_data" looks like a search-and-replace
                # artifact. It is kept first for compatibility - confirm
                # against a live response.
                return self._first_present(response.json(), 'douban_data', 'data')
            except (requests.RequestException, ValueError, KeyError) as e:
                print(f"获取电影页错误 !{e}请重试...")
                self._retire_cookie(
                    tmp_cookie,
                    f'当前cookie遇到问题:{tmp_cookie} 执行删除!出现错误:{e} 请5秒后继续重试...',
                )

    def up_cookies(self, page):
        """Refill the cookie pool when it is empty; otherwise do nothing.

        An empty pool puts the crawler to sleep for 30 minutes, after which
        the pool is reloaded through ``get_cookies_api``.

        Args:
            page: Unused; kept so existing callers keep working.
        """
        if self.COOKIE_POOL:
            return
        print("COOKIE_POOL:dead 程序开始休眠...")
        time.sleep(30 * 60)
        self.COOKIE_POOL = get_cookies_api()
        print("COOKIE_POOL激活 采集进行中...")

    def get_detail(self, url, page):
        """Fetch a movie detail page and return its HTML decoded as UTF-8.

        A failed request retires the cookie that was used and retries.

        Args:
            url: Detail page URL.
            page: Page index, forwarded to ``up_cookies``.

        Raises:
            RuntimeError: if no cookie is available even after a refill.
        """
        while True:
            tmp_cookie = self._take_cookie(page)
            request_headers = self._build_headers(tmp_cookie)
            try:
                response = requests.get(url, headers=request_headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(f"获取详情页错误 !{e} 尝试重连...")
                self._retire_cookie(
                    tmp_cookie,
                    f'当前cookie遇到问题:{tmp_cookie} 执行删除。出现错误:{e} 请5秒后继续重试...',
                )
                continue
            response.encoding = "utf-8"
            return response.text

    def for_mysql(self, dic):
        """Insert one movie record into ``douban_data``.

        A failed insert is rolled back and logged; it does not stop the crawl.

        Args:
            dic: Record as built by ``parse_detail``.
        """
        sql = "insert into douban_data(" \
              "MovName,fil_maker,writer,star_role,types,years,duration,Pro_region,languages,Rel_date,alias,IMDb,cover_img,score,url,movie_comment,now) " \
              "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # Timestamp without the fractional seconds.
        now = str(datetime.datetime.now()).split(".")[0]
        try:
            self.cursor.execute(sql, (
                dic['MovName'], dic['fil_maker'], dic['writer'], dic['star_role'], dic['types'], dic['years'],
                dic['duration'],
                dic['Pro_region'],
                dic['languages'], dic['Rel_date'], dic['alias'], dic['IMDb'], dic['cover_img'], dic['score'],
                dic['url'],
                dic['movie_comment'], now))
            self.conn.commit()
            print(dic)
            print(dic['MovName'], dic['IMDb'], dic['languages'], dic['Pro_region'], dic['alias'])
        except Exception as error:
            # Deliberately broad: one bad row must not abort the whole crawl.
            self.conn.rollback()
            database_error_info = f"{dic['MovName']} 错误:{error}"
            print(database_error_info)
            self.f.write(database_error_info + "\n")
            self.f.write("\n")

    @staticmethod
    def _extract_field(pattern, html):
        """Return the stripped first group of ``pattern`` in ``html``, or None."""
        match = pattern.search(html)
        if match is None:
            return None
        return match.group(1).strip()

    @staticmethod
    def _clean_comment(raw):
        """Strip whitespace, the copyright mark and the collapsed teaser from a synopsis."""
        text = raw.replace("\n", "").replace("\t", "").replace(" ", "").strip()
        # The original guarded this with `if "©豆瓣" or ... in text`, which is
        # always true; replacing unconditionally is what it actually did.
        text = text.replace("©豆瓣", "").replace("\u3000", "")
        if "(展开全部)" in text:
            # Keep only the full text that follows the "expand" marker.
            text = text.split("(展开全部)")[-1].strip()
        return text

    def _build_record(self, source_data, li):
        """Parse one detail page into a database record.

        Args:
            source_data: Detail page HTML.
            li: The movie's entry from the listing JSON.

        Returns:
            tuple: ``(record, missing)`` where ``missing`` counts the fields
            (region, cover, IMDb, language, alias) stored as ``"NULL"``.
        """
        missing = 0
        title, cover_img, score, url = li['title'], li['cover'], li['rate'], li["url"]
        # NOTE(review): the listing JSON has been seen using "casts" and
        # "directors" for these lists; the original names are tried first -
        # confirm against a live response.
        star_role = ",".join(self._first_present(li, 'star_role', 'casts'))
        fil_maker = ",".join(self._first_present(li, 'fil_maker', 'directors'))

        tree = etree.HTML(source_data)
        writer = ",".join(tree.xpath("//div[@id='info']/span[2]/span[@class='attrs']/a/text()"))
        types = ",".join(tree.xpath("//div[@id='info']/span[@property='v:genre']/text()"))
        Rel_date = "/".join(tree.xpath("//div[@id='info']/span[@property='v:initialReleaseDate']/text()"))
        years = Rel_date.split("-")[0].strip()

        runtime = tree.xpath("//div[@id='info']/span[@property='v:runtime']/text()")
        if runtime:
            duration = runtime[0].split("分钟")[0].strip() + "分钟"
        else:
            duration = "NULL"

        if not cover_img:
            cover_img = "NULL"
            missing += 1

        fields = {}
        for name, pattern in (
            ("Pro_region", self._REGION_RE),
            ("IMDb", self._IMDB_RE),
            ("languages", self._LANGUAGES_RE),
            ("alias", self._ALIAS_RE),
        ):
            value = self._extract_field(pattern, source_data)
            if value is None:
                value = "NULL"
                missing += 1
            fields[name] = value

        # Only the first production region is stored.
        if "/" in fields["Pro_region"]:
            fields["Pro_region"] = fields["Pro_region"].split("/")[0].strip()

        movie_comment = self._clean_comment("".join(tree.xpath("//div[@id='link-report']//text()")))

        record = {
            "MovName": title,
            "fil_maker": fil_maker,
            "writer": writer,
            "star_role": star_role,
            "types": types,
            "years": years,
            "duration": duration,
            "Pro_region": fields["Pro_region"],
            "languages": fields["languages"],
            "Rel_date": Rel_date,
            "alias": fields["alias"],
            "IMDb": fields["IMDb"],
            "cover_img": cover_img,
            "url": url,
            "score": score,
            "movie_comment": movie_comment,
        }
        return record, missing

    def parse_detail(self, source_data, li, page):
        """Parse a detail page and store it, re-fetching while it looks broken.

        A page with ``MAX_NULL_FIELDS`` or more missing fields is treated as a
        bad response and fetched again. After ``MAX_DETAIL_RETRIES`` re-fetches
        the URL is written to ``error_url.txt`` and the movie is skipped.

        Args:
            source_data: Detail page HTML.
            li: The movie's entry from the listing JSON.
            page: Page index, forwarded to ``get_detail``.
        """
        # A loop replaces the original self-recursion; the retry budget lives
        # in self.req_count, which get_url resets for every movie.
        while True:
            dic, missing = self._build_record(source_data, li)
            title, url = dic["MovName"], dic["url"]
            time.sleep(3)
            if missing < self.MAX_NULL_FIELDS:
                # Checked first so a good final attempt is stored, not thrown away.
                self.for_mysql(dic)
                return
            if self.req_count >= self.MAX_DETAIL_RETRIES:
                print(f"{url} 重复采集{self.MAX_DETAIL_RETRIES}次都不行 直接干入txt!")
                self.f1.write(f"{url}\n")
                return
            get_error = (f"{title} 出现超过{self.MAX_NULL_FIELDS}次NULL 异常 重新获取!"
                         f"当前请求次数: {self.req_count + 1}次")
            print(get_error)
            self.f.write(get_error + f" 出错的url是{url}" + "\n")
            print(title, dic["IMDb"], dic["languages"], dic["Pro_region"], dic["alias"])
            time.sleep(5)
            self.req_count += 1
            source_data = self.get_detail(url, page)

    def get_url(self, lis, page):
        """Crawl every movie of one listing page that is not yet in the database.

        Args:
            lis: Movie dicts returned by ``get_page_source``.
            page: Page index, forwarded to the fetch helpers.
        """
        for li in lis:
            # Bound parameter instead of str.format: URLs come from the remote
            # site and must not be spliced into the SQL text.
            rs = self.cursor.execute(
                'select 1 from douban_data where url=%s limit 1;', (li['url'],))
            if rs:
                print(f"{li['title']} 已经存在 跳过了! ")
                continue
            self.req_count = 0  # fresh retry budget for every movie
            source_data = self.get_detail(li['url'], page)
            self.parse_detail(source_data, li, page)
            time.sleep(random.randint(0, 1))  # short pause between movies

    def main(self):
        """Crawl pages ``begin_number`` .. ``end_number - 1``, then release all resources."""
        try:
            self.f.write("采集错误日志如下:\n")
            self.f1.write("超过重试采集的URL如下:\n")
            print("开始采集豆瓣电影:---------")
            for page in range(self.begin_number, self.end_number):
                # The endpoint pages by offset, 20 movies per page.
                url = f"https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start={page * 20}"
                self.get_url(self.get_page_source(url, page), page)
                time.sleep(5)  # pause between listing pages
                print_info = "*" * 20 + f"已经爬取第{page + 1}页数据!!!" + "*" * 20
                print(print_info)
                self.f.write(print_info + "\n")
        finally:
            # Also runs when the crawl aborts, so logs are flushed and the
            # database connection is not leaked.
            self.close()
        print("豆瓣电影抓取完毕!")
# Script entry point: crawl listing pages 0..999 (the end page is exclusive).
if __name__ == '__main__':
    G = Getmovie(0, 1000)  # page range to crawl
    G.main()
第二个文件up_cookies
def get_cookies_api(path=r"./cookies.txt"):
    """Load cookie strings from a text file, one cookie header value per line.

    Args:
        path: File to read. Defaults to ``./cookies.txt``, resolved against
            the current working directory.

    Returns:
        list: The stripped, non-empty lines in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # Blank lines are skipped: an empty string in the pool would be sent
        # as an empty Cookie header.
        return [cookie for cookie in stripped if cookie]
# Manual check: print the cookies that would be loaded.
if __name__ == '__main__':
    tt = get_cookies_api()
    print(tt)
第三个文件settings
# Base request headers. The crawler overrides 'User-Agent' and 'Cookie' with a
# random pool entry on every request; the values below are only defaults.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    # 'br' is not advertised: requests only decodes Brotli when an optional
    # brotli package is installed, and an undecoded body cannot be parsed as
    # JSON or HTML.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'll="108309"; bid=UpCjNCwL8tw; _pk_id.100001.4cf6=86e239bb5d1e40f5.1687157234.; __utma=30149280.2001300900.1687157240.1687157240.1687157240.1; __utmz=30149280.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1597086376.1687157240.1687157240.1687157240.1; __utmz=223695111.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=uWqKariOVI0bFpZLHZIjFI6W53iSEKk5; _vwo_uuid_v2=D2D44595FF9138F07ACF4DFB251FFC001|3b20052ea89a07c78affbf67a665173f; __gads=ID=75f644d5722f297b-22ce9134b9e1008d:T=1687158236:RT=1687158236:S=ALNI_MYZBHb1HXbcOKpvW-3lqy08x4Zk0g; __gpi=UID=00000c517d33a9da:T=1687158236:RT=1687158236:S=ALNI_MZgyjiwI_f-x2VhMv0zo1KZX3nrIw; push_noty_num=0; push_doumail_num=0; ct=y; dbcl2="271505372:glZK6fQJaqM"; ck=PFMx; ap_v=0,6.0',
    'Host': 'movie.douban.com',
    'Referer': 'https://www.bing.com/',
    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
# Pool of Cookie header values; the crawler picks one at random per request
# and removes entries it considers dead.
# NOTE(review): these look like real login sessions (dbcl2=...) committed to
# source - presumably they should live outside the repository (the crawler
# already reloads cookies from cookies.txt) and be rotated; confirm.
COOKIE_POOL=[
# paste cookie strings here, one per entry
'll="108309"; bid=UpCjNCwL8tw; _pk_id.100001.4cf6=86e239bb5d1e40f5.1687157234.; __utma=30149280.2001300900.1687157240.1687157240.1687157240.1; __utmz=30149280.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1597086376.1687157240.1687157240.1687157240.1; __utmz=223695111.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=uWqKariOVI0bFpZLHZIjFI6W53iSEKk5; _vwo_uuid_v2=D2D44595FF9138F07ACF4DFB251FFC001|3b20052ea89a07c78affbf67a665173f; __gads=ID=75f644d5722f297b-22ce9134b9e1008d:T=1687158236:RT=1687158236:S=ALNI_MYZBHb1HXbcOKpvW-3lqy08x4Zk0g; __gpi=UID=00000c517d33a9da:T=1687158236:RT=1687158236:S=ALNI_MZgyjiwI_f-x2VhMv0zo1KZX3nrIw; ap_v=0,6.0; dbcl2="271505185:6qFLl5TM21Y"; ck=121A; push_noty_num=0; push_doumail_num=0',
'll="108309"; bid=UpCjNCwL8tw; _pk_id.100001.4cf6=86e239bb5d1e40f5.1687157234.; __utma=30149280.2001300900.1687157240.1687157240.1687157240.1; __utmz=30149280.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1597086376.1687157240.1687157240.1687157240.1; __utmz=223695111.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=uWqKariOVI0bFpZLHZIjFI6W53iSEKk5; _vwo_uuid_v2=D2D44595FF9138F07ACF4DFB251FFC001|3b20052ea89a07c78affbf67a665173f; __gads=ID=75f644d5722f297b-22ce9134b9e1008d:T=1687158236:RT=1687158236:S=ALNI_MYZBHb1HXbcOKpvW-3lqy08x4Zk0g; __gpi=UID=00000c517d33a9da:T=1687158236:RT=1687158236:S=ALNI_MZgyjiwI_f-x2VhMv0zo1KZX3nrIw; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; dbcl2="271505261:qnsYo6Xvles"; ck=U2C_',
'll="108309"; bid=UpCjNCwL8tw; _pk_id.100001.4cf6=86e239bb5d1e40f5.1687157234.; __utma=30149280.2001300900.1687157240.1687157240.1687157240.1; __utmz=30149280.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1597086376.1687157240.1687157240.1687157240.1; __utmz=223695111.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=uWqKariOVI0bFpZLHZIjFI6W53iSEKk5; _vwo_uuid_v2=D2D44595FF9138F07ACF4DFB251FFC001|3b20052ea89a07c78affbf67a665173f; __gads=ID=75f644d5722f297b-22ce9134b9e1008d:T=1687158236:RT=1687158236:S=ALNI_MYZBHb1HXbcOKpvW-3lqy08x4Zk0g; __gpi=UID=00000c517d33a9da:T=1687158236:RT=1687158236:S=ALNI_MZgyjiwI_f-x2VhMv0zo1KZX3nrIw; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; dbcl2="271505281:clRj5psUJfI"; ck=PRI9',
'll="108309"; bid=UpCjNCwL8tw; _pk_id.100001.4cf6=86e239bb5d1e40f5.1687157234.; __utma=30149280.2001300900.1687157240.1687157240.1687157240.1; __utmz=30149280.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1597086376.1687157240.1687157240.1687157240.1; __utmz=223695111.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=uWqKariOVI0bFpZLHZIjFI6W53iSEKk5; _vwo_uuid_v2=D2D44595FF9138F07ACF4DFB251FFC001|3b20052ea89a07c78affbf67a665173f; __gads=ID=75f644d5722f297b-22ce9134b9e1008d:T=1687158236:RT=1687158236:S=ALNI_MYZBHb1HXbcOKpvW-3lqy08x4Zk0g; __gpi=UID=00000c517d33a9da:T=1687158236:RT=1687158236:S=ALNI_MZgyjiwI_f-x2VhMv0zo1KZX3nrIw; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; ct=y; dbcl2="271505372:glZK6fQJaqM"; ck=PFMx',
'll="108309"; bid=UpCjNCwL8tw; _pk_id.100001.4cf6=86e239bb5d1e40f5.1687157234.; __utma=30149280.2001300900.1687157240.1687157240.1687157240.1; __utmz=30149280.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1597086376.1687157240.1687157240.1687157240.1; __utmz=223695111.1687157240.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=uWqKariOVI0bFpZLHZIjFI6W53iSEKk5; _vwo_uuid_v2=D2D44595FF9138F07ACF4DFB251FFC001|3b20052ea89a07c78affbf67a665173f; __gads=ID=75f644d5722f297b-22ce9134b9e1008d:T=1687158236:RT=1687158236:S=ALNI_MYZBHb1HXbcOKpvW-3lqy08x4Zk0g; __gpi=UID=00000c517d33a9da:T=1687158236:RT=1687158236:S=ALNI_MZgyjiwI_f-x2VhMv0zo1KZX3nrIw; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; dbcl2="271505400:2usNW7AOTXA"; ck=eCBI; ct=y',
'll="108309"; bid=K-tN43daiCs; _vwo_uuid_v2=D5FCCE24482ED160B47CA8B3CB7E8B735|720d974a49ff51dd51c0fd4975812bc2; __gads=ID=fbb7af33ff97e8a8-229984c26ad10086:T=1648616100:RT=1648616100:S=ALNI_MaeRcBmY-X6INTzzGBcEVcw0JaJdQ; __yadk_uid=Bn6QEJPc1j83nMzbIA8fynmNsB207i7k; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1650126927; push_doumail_num=0; push_noty_num=0; __utmv=30149280.25272; ct=y; dbcl2="252720001:IawJWSdxtaI"; ck=yHNU; __utmc=30149280; __utmc=223695111; __gpi=UID=000006f72f201cd9:T=1656173534:RT=1656740091:S=ALNI_MYLYJVaAXaHKM1Vd5XX-3Ew-C3Q7A; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1656748693%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D5pbVb7oblf7NlwyETdYp-vDGeCFts4G2V1LSP5ufkfDOCj8dbE9RXLNcbtA0BFx3%26wd%3D%26eqid%3Dfb6918b5000364100000000662bffa90%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.241973065.1641897003.1656743697.1656748693.43; __utmb=30149280.0.10.1656748693; __utmz=30149280.1656748693.43.20.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.54529022.1641897003.1656743700.1656748693.42; __utmb=223695111.0.10.1656748693; __utmz=223695111.1656748693.42.22.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=79195a48ffc8f93e.1641897003.42.1656749196.1656745058',
'll="108309"; bid=K-tN43daiCs; _vwo_uuid_v2=D5FCCE24482ED160B47CA8B3CB7E8B735|720d974a49ff51dd51c0fd4975812bc2; __gads=ID=fbb7af33ff97e8a8-229984c26ad10086:T=1648616100:RT=1648616100:S=ALNI_MaeRcBmY-X6INTzzGBcEVcw0JaJdQ; __yadk_uid=Bn6QEJPc1j83nMzbIA8fynmNsB207i7k; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1650126927; push_doumail_num=0; push_noty_num=0; __utmv=30149280.25272; ct=y; dbcl2="252720001:IawJWSdxtaI"; __gpi=UID=000006f72f201cd9:T=1656173534:RT=1656824272:S=ALNI_MYLYJVaAXaHKM1Vd5XX-3Ew-C3Q7A; __utmz=30149280.1656836817.47.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1656836881.46.23.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ck=yHNU; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1656998335%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dcauz0mfLdw6WezPZnBxSGu8DYHs1S5yTRAZBqfHGIpcxIQFI-jISjgnX0bp-6ELL%26wd%3D%26eqid%3Df797c971000bdf720000000662c3c9b6%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.241973065.1641897003.1656836817.1656998336.48; __utmb=30149280.0.10.1656998336; __utmc=30149280; __utma=223695111.54529022.1641897003.1656836881.1656998336.47; __utmb=223695111.0.10.1656998336; __utmc=223695111; _pk_id.100001.4cf6=79195a48ffc8f93e.1641897003.47.1656998338.1656836881.'
] # cookie pool: logged-in user sessions
# Pool of User-Agent strings; the crawler picks one at random per request.
# The name ends in a lowercase "l" and the crawler refers to it by this exact
# spelling, so it must not be "corrected" here alone.
# NOTE(review): most entries are mobile browsers and one is a search-engine
# bot (YandexBot), while `headers` advertises desktop Chrome client hints
# (sec-ch-ua-mobile "?0", platform "Windows"). The site may serve different
# markup to mobile UAs or reject the mismatch - confirm, and consider using
# desktop UAs only.
USER_AGENT_POOl = [
'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.91 Mobile Safari/537.36',
'Mozilla/5.0 (Linux; Android 10; SM-G981B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36',
'Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Linux; Android 11.0; Surface Duo) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36',
'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
'Mozilla/5.0 (Linux; Android 9.0; SAMSUNG SM-F900U Build/PPR1.180610.011) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36',
'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320'
] # User-Agent pool
# Database connection settings, passed to pymysql.connect by the crawler.
# NOTE(review): credentials are hard-coded - presumably fine for a local
# practice database only; confirm before sharing this file.
HOST="localhost"
USER="root" # MySQL user name
PASSWORD="123456" # MySQL password
PORT=3306 # port number (default)
# NOTE(review): MySQL's "utf8" is the 3-byte utf8mb3 character set; consider
# "utf8mb4" if 4-byte characters (e.g. emoji in synopses) must be stored.
CHARTSET="utf8" # connection character set (default)
DB="prac_training" # database to use
你这段代码我没有细看,不过可以给你一个更稳妥的思路:很多数据是通过 XHR 请求加载的,并不是请求一条 url 就能把数据爬到。为什么不用自动化模块(例如 Selenium)操控谷歌浏览器,模拟人为浏览网页,等网页加载完成后再提取你需要的数据呢?我觉得这样更安全,也更符合逻辑。fetch 和 XHR 请求的 url 很难分析,时间成本高,还容易出错。
《小玩意儿》专栏的文章看来被Get_cookie.py下了咒,没错,又是它,实现自动登录的基石,想要了解可以看上边提到的文章哦!
话不多说,直接贴代码:
from selenium import webdriver
from time import sleep
import json
# Open the CSDN login page, wait, then dump the browser's cookies to a file.
if __name__ == '__main__':
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.get('https://passport.csdn.net/login?code=public')
        # Fixed 10-second pause - presumably the window in which the user
        # logs in by hand; whatever cookies exist afterwards are saved.
        sleep(10)
        dictCookies = driver.get_cookies()  # list of cookie dicts
        jsonCookies = json.dumps(dictCookies)  # serialise to a string for saving
        with open('csdn_cookies.txt', 'w') as f:
            f.write(jsonCookies)
        print('cookies保存成功!')
    finally:
        # Always shut the browser and its driver process down, even on error.
        driver.quit()
你的程序缺少文件,跑不起来。建议提供具体的错误信息,说明报的是什么错误。对于爬虫程序来说,爬取不成功,要么是解析方法有问题(例如节点的 xpath 写得不对),要么是被网站反爬了,导致获取不到正确的网页响应。因此,请提供具体的错误信息,方便帮你定位;或者自己使用调试模式一步步调试,看哪一步执行不正确。
可以参考下
import csv
import requests
# Download a slice of Douban's chart list and save it as a CSV file.
url = 'https://movie.douban.com/j/chart/top_list'
start = input('从库中第几部电影取:')
limit = input('取几部:')
param = {
    'type': '24',
    'interval_id': '100:90',
    'action': '',
    'start': start,
    'limit': limit}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.62'}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url=url, params=param, headers=headers, timeout=10)
# Fail with the HTTP status instead of a confusing JSON decode error.
response.raise_for_status()
list_data = response.json()
# 'w' overwrites the file; use 'a' to append instead. The with-block closes
# the file even if a row fails to write.
with open('./豆瓣.csv', 'w', encoding='utf-8-sig', newline='') as f:
    csv_write = csv.DictWriter(f, fieldnames=[
        '电影名',
        '主演人数',
        '主演',
        '评分',
        '上映时间',
        '类型',
        '评论数',
        '拍摄国家',
    ])
    csv_write.writeheader()
    for i in list_data:
        dic = {
            '电影名': i['title'],
            '主演人数': i['actor_count'],
            '主演': i['actors'],
            '评分': i['score'],
            '上映时间': i['release_date'],
            '类型': i['types'],
            '评论数': i['vote_count'],
            '拍摄国家': i['regions']
        }
        print(dic)
        csv_write.writerow(dic)
根据你提供的代码,有几个潜在的问题可能导致报错:
未导入必要的模块:代码中使用了一些模块和变量,如settings.py和get_cookies_api(),请确保已正确导入这些模块或定义了相应的函数。
变量未定义:代码中使用了一些变量,如HOST、USER、PASSWORD等,请确保这些变量已经定义并赋予了正确的值。
文件路径问题:代码中涉及到读取文件的路径,如"./error_log.txt"和"./cookies.txt",请确保这些文件存在于指定路径中。
异常处理问题:代码中使用了异常处理,但 except 中只打印了异常的简短描述(例如 print(f"获取电影页错误!{e}请重试...")),看不到出错的具体位置。建议在 except 块中同时输出完整的堆栈信息(例如调用 traceback.print_exc()),以便更好地定位问题。
提示cookie错误,那是网站做了cookie校验,得弄出正确的cookie设置到请求头中