Help out a newbie.. a basic Python 3 crawler question

import urllib.request
import os
import re

def into(url):
    url="http://www.piaofang168.com/"
    response=urllib.request.urlopen(url)
    html=response.read().decode('utf-8')

    print(html)
    return html

def find(url):
    findit=into(url).html
    findit=re.compile('(.*?)',re.S)
    items=re.findall(find,html)
    for item in items:
        print (item)
        f=open("a.txt","a")
        f.write(item)
        f.close()

Why won't it even print the html? It worked at first, but once I wrapped everything up with def it just stopped running....

def defines a function; you never actually called these functions.

Add the calling code:
import urllib.request
import re


def into(url):
    #url = "http://www.piaofang168.com/"
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')

    print(html)
    return html


def find(url):
    #findit = into(url).html
    html = into(url)
    findit = re.compile('(.*?)', re.S)
    # items = re.findall(find, html)
    items = re.findall(findit, html)
    with open("a.txt", "a") as f:
        for item in items:
            print(item)
            f.write(item)

url = "http://www.piaofang168.com/"
if __name__ == '__main__':
    find(url)
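
A note on the pattern: '(.*?)' by itself matches (mostly empty) strings at every position, because the forum has stripped out the HTML tags that originally surrounded the capture group. A minimal sketch of what such a pattern usually looks like, using a purely hypothetical <li>...</li> wrapper around the group:

import re

# Hypothetical pattern: the literal tags here stand in for whatever markup
# actually surrounds the target text on the page.
findit = re.compile('<li>(.*?)</li>', re.S)  # re.S lets . span newlines

html = "<li>first</li>\n<li>second</li>"
print(re.findall(findit, html))  # prints ['first', 'second']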

Then I'd recommend learning the BeautifulSoup library; the code below can serve as a sample (this code just got tripped up by another question and isn't finished yet; the first time I set out to answer a question I ran into a pit, and it feels pretty sad):

#!/usr/bin/env python
#coding:utf-8
import urllib.request
from bs4 import BeautifulSoup


def parse_list(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(req, timeout=60)
    contents = page.read()
    soup = BeautifulSoup(contents, "lxml")
    for tag in soup.find_all('div', class_='content-list'):
        try:
            data_url = tag.h3.a.attrs['href']
        except AttributeError:
            print("error at:", tag.get_text())
        else:
            if verbose:
                print(data_url)
            parse_data(data_base_url+data_url)


def parse_data(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(req, timeout=60)
    try:
        contents = page.read().decode('UTF-8')
    except UnicodeDecodeError:
        print("UnicodeDecodeError: " + url)
    else:
        soup = BeautifulSoup(contents, "lxml")
        try:
            tag = soup.find('div', id='homepost')
            # if verbose:
            #     print(tag)
            title = tag.find('div', class_='toptit').h2.get_text()
            if verbose:
                print(title)
            trs_left = tag.find('table', class_="infotable").find_all('tr')
            if verbose:
                print(trs_left)
            read_num = trs_left[1].td.span.get_text()
            download_num = trs_left[2].td.span.get_text()
            download_points = trs_left[3].td.span.get_text()

        except AttributeError:
            print("error at:", url)
        else:
            write_data(title, read_num, download_num, download_points, url)


def write_data(title, read_num, download_num, download_points, url):
    f.write(title + "," + read_num + "," + download_num + "," + download_points + "," + url + "\n")


base_url = 'http://www.codeforge.cn/l/0/c/0/t/0/v/0/p/'
data_base_url = 'http://www.codeforge.cn'
f = open('data.csv', 'w')
verbose = False
if __name__ == '__main__':
    f.write("title, read_num, download_num, download_points, url \n")
    for i in range(1000):
        parse_list(base_url + str(i))
        f.flush()
        print("has finish %s" % str((i+1)*10))



https://zhidao.baidu.com/question/139948449476860325.html