python爬取内容出现部分文字乱码

我写爬虫爬小说,内容是爬下来了,但出现了类似乱码但又不像的东西,只有部分字是乱码

img

下面是我打的代码

import requests
import os
import lxml
from bs4 import BeautifulSoup

if name == 'main':
url = 'https://www.linovelib.com/novel/2547/catalog'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
page_text = requests.get(url=url,headers=headers).text

soup = BeautifulSoup(page_text,'lxml')
li_list = soup.select('.volume-list ul')
li_list = li_list[0]
li_list = str(li_list).split('\n')
print(li_list)
li_list1 = soup.select('.volume-list ul > div')
li_list2 = soup.select('.volume-list li')
Number1 = -1;
Number2 = -1;

for li in li_list:
    if 'v-line' in li:
        print('2')
        Number1 = Number1 + 1;
        if Number1 < 9:
            if not os.path.exists('./' + str(li_list1[Number1].text)):
                os.mkdir('./' + str(li_list1[Number1].text))

    elif 'href' in li:
        if Number2 < len(li_list2):
            print('1')
            Number2 = Number2 + 1
            href = 'https://www.linovelib.com' + li_list2[Number2].a['href']
            href_page = requests.get(url=href,headers=headers,)
            href_page.encoding = 'utf-8'
            href_page_text = href_page.text

            href_soup = BeautifulSoup(href_page_text,'lxml')
            href_list = href_soup.find('div',id='mlfy_main_text')
            content = href_list.text
            print(content)

            fileName = './' + str(li_list1[Number1].text)+ '/' + str(li_list2[Number2].a.string) + '.text'
            fp = open(fileName,'w',encoding='utf-8')
            fp.write(content)

print('nb')

我这里可以的

import requests
import os
import lxml
from bs4 import BeautifulSoup

# if __name__ == '__main__':
url = 'https://www.linovelib.com/novel/2547/catalog'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text

soup = BeautifulSoup(page_text, 'lxml')
li_list = soup.select('.volume-list ul')
li_list = li_list[0]
li_list = str(li_list).split('\n')
print(li_list)
li_list1 = soup.select('.volume-list ul > div')
li_list2 = soup.select('.volume-list li')
Number1 = -1;
Number2 = -1;

for li in li_list:
    if 'v-line' in li:
        print('2')
        Number1 = Number1 + 1;
        if Number1 < 9:
            if not os.path.exists('./' + str(li_list1[Number1].text)):
                os.mkdir('./' + str(li_list1[Number1].text))

    elif 'href' in li:
        if Number2 < len(li_list2):
            print('1')
            Number2 = Number2 + 1
            href = 'https://www.linovelib.com' + li_list2[Number2].a['href']
            href_page = requests.get(url=href, headers=headers, )
            href_page.encoding = 'utf-8'
            href_page_text = href_page.text

            href_soup = BeautifulSoup(href_page_text, 'lxml')
            href_list = href_soup.find('div', id='mlfy_main_text')
            content = href_list.text
            print(content)

            fileName = './' + str(li_list1[Number1].text) + '/' + str(li_list2[Number2].a.string) + '.text'
            fp = open(fileName, 'w', encoding='utf-8')
            fp.write(content)

print("nb")

试试
https://blog.csdn.net/weixin_39605578/article/details/110066426

你代码可能没错,但是你的pycharm未设置编码格式

img

img


可以参考下