使用python写的爬虫
import sys
sys.getdefaultencoding()
from re import *
http = PoolManager()
disable_warnings()
def download(url):
result = http.request('GET',url)
htmlStr = result.data.decode('utf-8')
return htmlStr
def analyse(htmlStr):
aList = findall(']*>',htmlStr)
result = []
for a in aList:
g = search('href[\s]*=[\s]*\'"[\'"]',a)
if g != None:
url = g.group(1)
url = 'http://www.weather.com.cn/' + url
result.append(url)
return result
#print(analyse(download('http://www.weather.com.cn/')))
def crawler(url):
print(url)
html = download(url)
urls = analyse(html)
for url in urls:
crawler(url)
crawler('http://www.weather.com.cn/')
为什么程序运行下来后,会不断的循环www.weather.com.cn这个页面,而不会调到子页面
望大佬指教!
输出看看urls列表里面都有什么东西??