import urllib2
import urllib
import re
class BDTB:
def init(self,baseUrl,see_LZ):
self.baseURL = baseUrl
self.seeLZ = '?see_lz='+str(see_LZ)
def getPage(self,pageNum):
try:
url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
request = urllib2.Request(url)
response =urllib2.urlopen(request)
print response.read()
return response
except urllib2.URLError , e:
if hasattr(e,"reason"):
print u"link fail,reason",e.reason
return None
def getTitle(self):
page = self.getPage(1)
pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>',re.S)
result = re.search(pattern,page)
if result:
print result.group(1)
return result.group(1).strip()
else:
return None
def getPageNum(self):
page = self.getPage(1)
print page.read()
pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
result = re.search(pattern, page)
if result:
print result.group(1)
return result.group(1).strip()
else:
return None
def getContent(self):
page = self.getPage(1)
pattern = re.complie('<div id="post_content_.*?>(.*?)</div>',re.S)
items = re.findall(pattern,page)
for item in items:
print item
baseURL = "http://tieba.baidu.com/p/4866982459"
bdtb = BDTB(baseURL,1)
bdtb.getPage(1)
bdtb.getTitle()
bdtb.getPageNum()
bdtb.getContent()
问问题之前先把报错的traceback和错误提示发出来??
Traceback (most recent call last):
运行getTitle()时候的错误
File "F:\python学习\程序代码\爬虫\123.py", line 52, in
bdtb.getTitle()
File "F:\python学习\程序代码\爬虫\123.py", line 24, in getTitle
result = re.search(pattern,page)
File "C:\Python27\lib\re.py", line 146, in search
return _compile(pattern, flags).search(string)
TypeError: expected string or buffer
运行getContent()的错误
Traceback (most recent call last):
File "F:\python学习\程序代码\爬虫\123.py", line 53, in
bdtb.getContent()
File "F:\python学习\程序代码\爬虫\123.py", line 43, in getContent
pattern = re.complie('
正则双引号需要转义,正则里面别的不知有没有写错,建议使用工具多测试。
另外推荐一个python强大的爬虫html文档分析包,Beautifulsoup