#! /usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
def craw(url, page):
html1 = urllib2.Request(url)
response = urllib2.urlopen(html1).read()
response = str(response)
pat1 = '<img width="220" height="220" class="err-product" data-img="1" src="//.+?" />'
imagelist = re.compile(pat1).findall(response)
x = 1
for imageurl in imagelist:
imagename = "D:/手表/" + str(page) + str(x) + ".jpg"
imagesurl = "http://" + imageurl
try:
urllib.urlretrieve(imagesurl, filename = imagename)
except urllib2.URLError as e:
if hasattr(e, "code"):
x += 1
if hasattr(e, "reason"):
x += 1
x += 1
for i in range(2,6):
url = "https://search.jd.com/Search?keyword=手表%20男&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V16&wq=手表&page=" + str(i)
craw(url, i)
运行,开始报错:
Traceback (most recent call last):
File "C:/Users/JIE/Desktop/py file/����ѵ��.py", line 30, in <module>
craw(url, i)
File "C:/Users/JIE/Desktop/py file/����ѵ��.py", line 20, in craw
urllib.urlretrieve(imagesurl, filename = imagename)
File "C:\Python27\lib\urllib.py", line 91, in urlretrieve
return _urlopener.retrieve(url, filename, reporthook, data)
File "C:\Python27\lib\urllib.py", line 237, in retrieve
fp = self.open(url, data)
File "C:\Python27\lib\urllib.py", line 205, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 342, in open_http
h.endheaders(data)
File "C:\Python27\lib\httplib.py", line 951, in endheaders
self._send_output(message_body)
File "C:\Python27\lib\httplib.py", line 811, in _send_output
self.send(msg)
File "C:\Python27\lib\httplib.py", line 773, in send
self.connect()
File "C:\Python27\lib\httplib.py", line 754, in connect
self.timeout, self.source_address)
File "C:\Python27\lib\socket.py", line 553, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 11001] getaddrinfo failed
import urllib
import urllib2
import re
def craw(url, page):
html1 = urllib2.Request(url)
response = urllib2.urlopen(html1).read()
response = str(response)
pat1 = '"//.+?" />'
imagelist = re.compile(pat1).findall(response)
x = 1
print(imagelist)
for imageurl in imagelist:
imagename = str(page) + str(x) + ".jpg"
imagesurl = "http://" + imageurl[3:-4]
print(imagesurl)
try:
urllib.urlretrieve(imagesurl, filename = imagename)
except urllib2.URLError as e:
if hasattr(e, "code"):
x += 1
if hasattr(e, "reason"):
x += 1
x += 1
for i in range(2,6):
url = "https://search.jd.com/Search?keyword=手表%20男&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V16&wq=手表&page=" + str(i)
craw(url, i)
正则不对,匹配后的内容和http://拼接后不是正确的url,所以打不开,碰到这些问题逐步print出来就能找到哪里错了
本人刚学python 的爬虫,就去爬京东的手表图片练手,然后一直报错,代码,错误都在上面了,求解决呀
先打印一下imagesurl,估计它的地址有问题。或者你的网络不能访问这个URL
https://www.2cto.com/kf/201410/344471.html
首先, python2里面中文字符串要带u开头, 你url就没弄好
其次, 为什么不用requests库, 原生的urllib 默认不带自动urlencode功能,中文可能没encode成百分号那种编码
1、你提取图片网址的正则表达式有问题,你得到的列表应该img标签的列表,而不是网址的列表,应该修改为:pat1 = '',加一个group分组
2、中文目录需要处理一下:imagename=unicode(imagename)
3、小问题,你的编码设置好像有点问题
网络可能不可以访问这个URL
1、你提取图片网址的正则表达式有问题,你得到的列表应该img标签的列表,而不是网址的列表,应该修改为:pat1 = \'\',加一个group分组,我测试应该是可以的
2、中文目录需要处理一下:imagename=unicode(imagename)
3、小问题,你的编码设置好像有点问题
地址和端口号,估计你的地址有问题