Here is a reference implementation; hope it helps.
import requests
import re
import os
import wget
import threading
import time
import random

# Guards concurrent appends to the shared download list.
download_lock = threading.Lock()
# Search form fields for POSTing to /satellites; empty strings mean "no filter".
post_dict = {
    'catalogue_id': '',
    'name': '',
    'comments': '',
    'var_nonvariable': 'on',
    'var_variable': 'on',
    'var_periodic': 'on',
    'var_pmin': '',
    'var_pmax': '',
    'action': 'search',
    'type_0': 'on',
    'type_1': 'on',
    'type_2': 'on',
    'type_3': 'on',
    'type_4': 'on',
    'type_5': 'on',
    'type_6': 'on',
    'type_7': 'on',
    'orb_incl_min': '',
    'orb_incl_max': '',
    'orb_period_min': '',
    'orb_period_max': ''
}
# Browser-like headers; note the csrftoken cookie is session-specific and will go stale.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'csrftoken=9jhoS62afWsXqv1DisuneTFkmWySczcN',
    'Host': 'mmt.favor2.info',
    'Referer': 'http://mmt.favor2.info/satellites',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76'
}

def SetDict(id):
    # Point the search form at the requested catalogue id.
    global post_dict
    post_dict['catalogue_id'] = str(id)

def Download(url, savedir):
    print('%s downloading, will save to %s\n' % (url.strip(), savedir), end='')
    wget.download(url, out=savedir)
    print('%s finished, saved to %s\n' % (url.strip(), savedir), end='')

def checkStatus(track_id):
    # Keep only tracks whose page marks the lightcurve as "Periodic".
    print('Checking: %s\n' % (track_id), end='')
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; Tablet PC 2.0; wbx 1.0.0; wbxapp 1.0.0; Zoom 3.6.0)"
    }
    response = requests.get('http://mmt.favor2.info/satellites/track/%s' % (str(track_id)), headers=headers)
    response.encoding = response.apparent_encoding  # re-decode with the detected charset
    text = response.text
    tag = "<span class=\"text-default\">Periodic</span></td></tr><tr><td>Lightcurve period"
    if tag in text:
        global download
        print('%s passed the check\n' % (track_id), end='')
        with download_lock:
            download.append(track_id)
    else:
        print('%s failed the check\n' % (track_id), end='')

def LoadPage(url, savedir):
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; Tablet PC 2.0; wbx 1.0.0; wbxapp 1.0.0; Zoom 3.6.0)"
    }
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    text = response.text
    # "Downoad" is how the link title is spelled in the page's own markup.
    result = re.findall(r'<a href="/satellites/track/(.*)/download" title="Downoad track">T</a>', text)
    print('%s: collected unfiltered track ids (%d items)' % (url, len(result)))
    threads = []
    global download
    download = []
    # Check every track id concurrently, with a small random delay between spawns.
    for sid in result:
        time.sleep(random.randint(1, 10) / 10)
        thd = threading.Thread(target=checkStatus, args=(sid,))
        thd.start()
        threads.append(thd)
    while len(threads) != 0:
        threads[0].join()
        threads.pop(0)
    print('%s: track ids remaining after filtering (%d items)' % (url, len(download)))
    for sid in download:
        time.sleep(random.randint(1, 10) / 10)
        thd = threading.Thread(target=Download,
                               args=('http://mmt.favor2.info/satellites/track/%s/download' % (sid),
                                     os.path.join(savedir, 'track_%s.txt' % (sid))))
        thd.start()
        threads.append(thd)
    while len(threads) != 0:
        threads[0].join()
        threads.pop(0)

def LoadDownSatellites(id, savedir):
    SetDict(id)
    global post_dict
    response = requests.post("http://mmt.favor2.info/satellites", post_dict)
    ## with open("test.html","w") as f:
    ##     f.write(response.text)
    ## with open("test.html","r") as f:
    ##     text = f.read()
    response.encoding = response.apparent_encoding
    text = response.text
    # The "Log in" link's next= parameter carries the site's internal satellite id.
    sid = re.findall(r'<a href="/accounts/login/\?next=/satellites/(.*)">Log in</a></li>', text)[0]
    # Count pagination links to learn how many result pages there are.
    page = len(re.findall(r'/satellites/%s\?page=.' % (sid), text)) + 1
    print('Found internal id %s for %d, %d page(s) to download' % (sid, id, page))
    if not os.path.exists(os.path.join(savedir, str(id))):
        print('%s does not exist, creating it' % (os.path.join(savedir, str(id))))
        os.makedirs(os.path.join(savedir, str(id)))
    for pg in range(1, page + 1):
        LoadPage('http://mmt.favor2.info/satellites/%s?page=%d' % (sid, pg), os.path.join(savedir, str(id)))

LoadDownSatellites(163, './Data')
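One fragile spot worth flagging: the Cookie header above hard-codes a csrftoken, which will eventually expire. A hedged alternative (a sketch, not verified against the live site) is to let requests manage cookies with a Session: GET the search page once so the server sets fresh cookies, then reuse the same session for the POST.

import requests

session = requests.Session()
session.headers.update(header)       # reuse the header dict defined above
session.headers.pop('Cookie', None)  # drop the stale hard-coded cookie

# The first GET lets the server set fresh cookies (csrftoken etc.) on the session.
session.get('http://mmt.favor2.info/satellites')
response = session.post('http://mmt.favor2.info/satellites', data=post_dict)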
You can use third-party Python libraries to grab specific content from a web page and download it: for example, the requests library to fetch the page, BeautifulSoup to parse the HTML/XML, and then requests or urllib to download the targets.
Below is a simple example. It fetches a page with requests, parses the HTML with BeautifulSoup, collects all image links on the page, and downloads each image.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

page_url = 'https://www.example.com'
response = requests.get(page_url)
soup = BeautifulSoup(response.text, 'html.parser')

img_tags = soup.find_all('img')
# Resolve relative src attributes against the page URL, skipping tags without src.
urls = [urljoin(page_url, img['src']) for img in img_tags if img.get('src')]

for img_url in urls:
    response = requests.get(img_url)
    file_name = img_url.split('/')[-1]
    with open(file_name, 'wb') as f:
        f.write(response.content)
    print('Successfully downloaded', file_name)
Note that this example uses BeautifulSoup's find_all method to locate every img tag on the page, then a list comprehension to pull out the image URLs (urljoin resolves relative paths against the page URL). Each image URL is then fetched with requests.get and the response body is written to a file.
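For large files, response.content pulls the whole body into memory at once. A minimal sketch of a chunked alternative, using requests' stream=True and iter_content (the chunk size here is an arbitrary choice):

import requests

def download_file(url, file_name, chunk_size=8192):
    # Stream the body so large files never sit fully in memory.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)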
If you are scraping images from a website, read the site's terms of service first and follow its crawling rules (robots.txt included) to avoid unnecessary trouble.
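If you want to check those rules programmatically, the standard library's urllib.robotparser can read robots.txt; a small sketch (example.com is a placeholder):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser('https://www.example.com/robots.txt')
rp.read()  # fetch and parse robots.txt
if rp.can_fetch('*', 'https://www.example.com/some/page'):
    print('Allowed to fetch')
else:
    print('Disallowed by robots.txt')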