import urllib2
response = urllib2.urlopen('http://www.pceggs.com/play/pc28.aspx')
html = response.read()
print html
# a raw string keeps the backslashes in the Windows path from being read
# as escape sequences; 'wb' writes the raw response bytes in one go
f = open(r'c:\Users\Administrator\Desktop\pceggs.html', 'wb')
f.write(html)
f.close()
Some pages need a login before I can grab the source, which needs cookielib. When I'm using urllib2, how do I use cookielib at the same time to access the site?
My English isn't good and I couldn't find a solution searching Google in Chinese. If you know how, please explain, thanks a lot.
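The usual pattern is to put a cookielib.CookieJar behind an urllib2 opener via HTTPCookieProcessor, log in through that opener so the session cookie lands in the jar, and then keep using the same opener for every later request. A minimal sketch, assuming the site sets its session cookie on a normal form POST; the login URL and the form field names below are placeholders:

import urllib, urllib2, cookielib

# the CookieJar collects cookies from every response and sends them back
# automatically on later requests made through the same opener
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

# hypothetical login URL and form fields -- replace with the site's real ones
login_data = urllib.urlencode({'username': 'me', 'password': 'secret'})
opener.open('http://example.com/login', login_data)  # POST; cookies land in cj

# later requests through the same opener carry the session cookie
html = opener.open('http://example.com/members-only.html').read()
print html

Below is a fuller example against a real site (www.idcourts.us); there the session id from the jar also gets appended to the stylesheet URLs before the actual search page is requested: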
import cookielib, urllib2
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# default User-Agent ('Python-urllib/2.6') will *not* work
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'),
]
stylesheets = [
    'https://www.idcourts.us/repository/css/id_style.css',
    'https://www.idcourts.us/repository/css/id_print.css',
]
home = opener.open('https://www.idcourts.us/repository/start.do')
print cj
# pull the JSESSIONID value out of the jar (this reaches into a private
# attribute of CookieJar, but it works for this domain/path layout)
sessid = cj._cookies['www.idcourts.us']['/repository']['JSESSIONID'].value
# Note the +=
opener.addheaders += [
    ('Referer', 'https://www.idcourts.us/repository/start.do'),
]
for st in stylesheets:
    # the trick: append the session id to each stylesheet URL so the server
    # ties these requests to the same session
    opener.open(st + ';jsessionid=' + sessid)
search = opener.open('https://www.idcourts.us/repository/partySearch.do')
print cj
# perhaps need to keep updating the referer...
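The stylesheet requests seem to be what makes the server treat this as a real browser session, which is why the JSESSIONID is appended to each CSS URL before partySearch.do is fetched. To verify that it worked, the jar and the returned page can be inspected directly; a small sketch reusing the cookie-aware opener above (the output path is just an example):

# list every cookie the server has set so far
for cookie in cj:
    print cookie.name, '=', cookie.value, 'for', cookie.domain + cookie.path

# save the search page so it can be compared with what a browser shows
html = search.read()
f = open(r'c:\Users\Administrator\Desktop\partySearch.html', 'wb')
f.write(html)
f.close()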