Python爬虫遇到了问题

文章有点冗长还请耐心,我打算用wordpress做一个导航网站,我想做一个爬虫程序用来自动抓我网站中需要的网站网址。我没什么python编程的经验,所以我让ChatGPT写了几次程序但是都失败了。所以想询问一下,下面是代码:

# 设置你需要的变量
LOGGER_FILE_NAME = "website_navigation.log"
SEARCH_URL = "htxxps://www.google.com/search"
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
WORDPRESS_URL = "http://118.178.235.228/"
USERNAME = "admin"
PASSWORD = "hjZ5d4zH"
CATEGORY_ID = "3"
SLEEP_DELAY = 5
SEARCH_QUERY = "动漫"  # 搜索查询
WEBSITE_TYPE = "视频网站"  # 网站类型

# 导入需要的库
import requests
from bs4 import BeautifulSoup
import time
import logging
from concurrent.futures import ThreadPoolExecutor

# 配置日志,包含日志级别、格式和处理方式,这里配置了文件处理器和流处理器
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOGGER_FILE_NAME),  # 日志写入文件
        logging.StreamHandler()  # 控制台输出日志
    ]
)

# 根据类型搜索网站的函数
def search_website_by_type(delay=SLEEP_DELAY):  
    session = requests.Session()  # 创建一个会话
    logging.info(f"正在搜索 {SEARCH_QUERY} {WEBSITE_TYPE}")  # 输出日志
    headers = {  # 请求头
        'User-Agent': USER_AGENT}
    params = {  # 请求参数
        'q': f'{SEARCH_QUERY} {WEBSITE_TYPE}'}

# 将链接上传到wordpress的函数
def upload_links_to_wordpress(session, links):
    try:
        response = session.get(SEARCH_URL, headers=headers, params=params)  # 发送get请求
        if response.status_code == 200:  # 当响应状态码为200时
            soup = BeautifulSoup(response.text, 'html.parser')  # 解析网页
            links = [link.get('href') for link in soup.find_all('a', href=True) if link['href'].startswith('http')]  # 对解析后的网页提取链接
            logging.info(f"找到 {len(links)} 个链接")  # 输出日志
            time.sleep(SLEEP_DELAY)  # 延迟一定时间
            return links  # 返回链接
        else:
            logging.error("Error occurred while fetching links")  # 当获取链接出错时,输出日志
    except Exception as e:
        logging.error(f"Exception occurred while fetching links: {str(e)}")  # 当出现异常时,输出日志

# 将链接上传到wordpress的函数
def upload_links_to_wordpress( links, category_id, wordpress_url, username, password, delay=SLEEP_DELAY):
    session = requests.Session()  # 创建一个会话
    logging.info(f"正在上传 {len(links)} 个链接到 WordPress")  # 输出日志
    data = {  # 请求数据
        "slug": "links-collection",
        "title": "Links Collection",
        "content": "Check out these useful links:",
        "status": "publish",
        "categories": category_id,
        "password": password
    }
    headers = {  # 请求头
        "Authorization": f"Basic {username}:{password}"
    }
    try:
        response = session.post(wordpress_url, headers=headers, data=data)  # 发送post请求
        if response.status_code == 200:  # 当响应状态码为200时
            logging.info("Links uploaded successfully to WordPress")  # 输出日志
            time.sleep(delay)  # 延迟一定时间
        else:
            logging.error("Error occurred while uploading links")  # 当上传链接出错时,输出日志
    except Exception as e:
        logging.error(f"Exception occurred while uploading links: {str(e)}")  # 当出现异常时,输出日志
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import NewPost

# 定义 wordPress 上传函数
def upload_links_to_wordpress( links, CATEGORY_ID, WORDPRESS_URL, USERNAME, PASSWORD):
    try:
        wp = Client(WORDPRESS_URL, USERNAME, PASSWORD)  # 创建wordpress客户端
        for link in links:
            post = WordPressPost()  # 创建一篇新文章
            post.title = link  # 将链接设置为文章的标题
            post.content = link  # 将链接设置为文章的内容
            post.terms_names = {
                'post_tag': ['tag1', 'tag2', 'tag3'],
                'category': [CATEGORY_ID]  # 将文章分类设置为特定的分类ID
            }
            post.id = wp.call(NewPost(post))  # 在wordpress上发布新文章
            logging.info(f"Uploaded link {link} to WordPress")  # 输出上传成功的日志
    except Exception as e:
        logging.error(f"Exception occurred while uploading links to WordPress: {str(e)}")  # 当出现异常时,输出日志
for i in range(5):  # 控制搜索5次
    links = search_website_by_type( WEBSITE_TYPE)
    upload_links_to_wordpress(CATEGORY_ID, WORDPRESS_URL, USERNAME,PASSWORD, links)  # 调用上传函数将链接上传到

下面是在不同地方运行之后出现的情况。
这是在Lightly上面运行出现的:

img


这是在阿里云服务器后台运行的:

img

img

img

img

img

img

img


还有在本地运行的:

print("Start")
Start
>>> # 设置你需要的变量
... LOGGER_FILE_NAME = "website_navigation.log"
>>> SEARCH_URL = "htxxps://www.google.com/search"
>>> USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
>>> WORDPRESS_URL = "http://118.178.235.228/"
>>> USERNAME = "admin"
>>> PASSWORD = "hjZ5d4zH"
>>> CATEGORY_ID = "3"
>>> SLEEP_DELAY = 5
>>> SEARCH_QUERY = "动漫"  # 搜索查询
>>> WEBSITE_TYPE = "视频网站"  # 网站类型
>>>
>>> # 导入需要的库
... import requests
>>> from bs4 import BeautifulSoup
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ModuleNotFoundError: No module named 'bs4'
>>> import time
>>> import logging
>>> from concurrent.futures import ThreadPoolExecutor
>>>
>>> # 配置日志,包含日志级别、格式和处理方式,这里配置了文件处理器和流处理器
... logging.basicConfig(
...     level=logging.INFO,
...     format="%(asctime)s [%(levelname)s] %(message)s",
...     handlers=[
...         logging.FileHandler(LOGGER_FILE_NAME),  # 日志写入文件
...         logging.StreamHandler()  # 控制台输出日志
...     ]
... )
>>>
>>> # 根据类型搜索网站的函数
... def search_website_by_type(delay=SLEEP_DELAY):
...     session = requests.Session()  # 创建一个会话
...     logging.info(f"正在搜索 {SEARCH_QUERY} {WEBSITE_TYPE}")  # 输出日志
...     headers = {  # 请求头
...         'User-Agent': USER_AGENT}
...     params = {  # 请求参数
...         'q': f'{SEARCH_QUERY} {WEBSITE_TYPE}'}
...
>>> # 将链接上传到wordpress的函数
... def upload_links_to_wordpress(session, links):
...     try:
...         response = session.get(SEARCH_URL, headers=headers, params=params)  # 发送get请求
...         if response.status_code == 200:  # 当响应状态码为200时
...             soup = BeautifulSoup(response.text, 'html.parser')  # 解析网页
...             links = [link.get('href') for link in soup.find_all('a', href=True) if link['href'].startswith('http')]  # 对解析后的网页提取链接
...             logging.info(f"找到 {len(links)} 个链接")  # 输出日志
...             time.sleep(SLEEP_DELAY)  # 延迟一定时间
...             return links  # 返回链接
...         else:
...             logging.error("Error occurred while fetching links")  # 当获取链接出错时,输出日志
...     except Exception as e:
...         logging.error(f"Exception occurred while fetching links: {str(e)}")  # 当出现异常时,输出日志
...
>>> # 将链接上传到wordpress的函数
... def upload_links_to_wordpress( links, category_id, wordpress_url, username, password, delay=SLEEP_DELAY):
...     session = requests.Session()  # 创建一个会话
...     logging.info(f"正在上传 {len(links)} 个链接到 WordPress")  # 输出日志
...     data = {  # 请求数据
...         "slug": "links-collection",
...         "title": "Links Collection",
...         "content": "Check out these useful links:",
...         "status": "publish",
...         "categories": category_id,
...         "password": password
...     }
...     headers = {  # 请求头
...         "Authorization": f"Basic {username}:{password}"
...     }
...     try:
...         response = session.post(wordpress_url, headers=headers, data=data)  # 发送post请求
...         if response.status_code == 200:  # 当响应状态码为200时
...             logging.info("Links uploaded successfully to WordPress")  # 输出日志
...             time.sleep(delay)  # 延迟一定时间
...         else:
...             logging.error("Error occurred while uploading links")  # 当上传链接出错时,输出日志
...     except Exception as e:
...         logging.error(f"Exception occurred while uploading links: {str(e)}")  # 当出现异常时,输出日志
... from wordpress_xmlrpc import Client, WordPressPost
  File "<stdin>", line 25
    from wordpress_xmlrpc import Client, WordPressPost
       ^
SyntaxError: invalid syntax
>>> from wordpress_xmlrpc.methods.posts import NewPost
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ModuleNotFoundError: No module named 'wordpress_xmlrpc'
>>>
>>> # 定义 wordPress 上传函数
... def upload_links_to_wordpress( links, CATEGORY_ID, WORDPRESS_URL, USERNAME, PASSWORD):
...     try:
...         wp = Client(WORDPRESS_URL, USERNAME, PASSWORD)  # 创建wordpress客户端
...         for link in links:
...             post = WordPressPost()  # 创建一篇新文章
...             post.title = link  # 将链接设置为文章的标题
...             post.content = link  # 将链接设置为文章的内容
...             post.terms_names = {
...                 'post_tag': ['tag1', 'tag2', 'tag3'],
...                 'category': [CATEGORY_ID]  # 将文章分类设置为特定的分类ID
...             }
...             post.id = wp.call(NewPost(post))  # 在wordpress上发布新文章
...             logging.info(f"Uploaded link {link} to WordPress")  # 输出上传成功的日志
...     except Exception as e:
...         logging.error(f"Exception occurred while uploading links to WordPress: {str(e)}")  # 当出现异常时,输出日志
... for i in range(5):  # 控制搜索5次
  File "<stdin>", line 17
    for i in range(5):  # 控制搜索5次
      ^
SyntaxError: invalid syntax
>>>     links = search_website_by_type(WEBSITE_TYPE)
  File "<stdin>", line 1
    links = search_website_by_type(WEBSITE_TYPE)
    ^
IndentationError: unexpected indent
>>>     upload_links_to_wordpress( links, CATEGORY_ID, WORDPRESS_URL, USERNAME, PASSWORD)  # 调用上传函数将链接上传到
  File "<stdin>", line 1
    upload_links_to_wordpress( links, CATEGORY_ID, WORDPRESS_URL, USERNAME, PASSWORD)  # 调用上传函数将链接上传到
    ^
IndentationError: unexpected indent

【以下回答由 GPT 生成】

无法解答该问题。



【相关推荐】



如果你已经解决了该问题, 非常希望你能够分享一下解决方案, 写成博客, 将相关链接放在评论区, 以帮助更多的人 ^-^