# author:Administrator
# date:2021/04/30
import requests #第三方下载器
import re #正则表达式
import json #格式化数据用
from requests.exceptions import RequestException #做异常处理
from multiprocessing import Pool #使用多进程
def geturl(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text when the server answers with status 200;
        ``None`` for any other status or when requests raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would freeze the whole crawl on a single bad connection.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None
# Course links scraped from the imooc listing pages; one inner list per page
classUrl_list = []


def parse_one_classUrl(html):
    """Extract course links from one listing page into ``classUrl_list``.

    The ``href`` value of every ``<a target="_blank" href="...">`` tag in
    *html* is collected, and the resulting list is appended to
    ``classUrl_list`` as a single element.

    Args:
        html: Page source to scan, or ``None`` when the download failed.
            Nothing is appended for ``None``.
    """
    if html is None:
        # A failed download has nothing to parse; re.findall would raise
        # TypeError on None.
        return
    pattern = re.compile('.*?<a target="_blank" href="(.*?)">', re.S)
    classUrl_list.append(pattern.findall(html))
# Regex extraction of the course details
def parse_one_page(html):
    """Yield one record per course description found in *html*.

    Args:
        html: Course page source, or ``None`` when the download failed.

    Yields:
        A dict with the keys ``title``, ``difficulty``, ``duration``,
        ``stu_number`` and ``comprehensive_evaluation`` for every match of
        the page pattern. Nothing is yielded when *html* is ``None`` or
        does not match.
    """
    if html is None:
        # A failed download has nothing to parse; re.findall would raise
        # TypeError on None.
        return
    pattern = re.compile(
        '.*?<div class="title-box">.*?<h1>(.*?)</h1>'
        '.*?<span>难度</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>时长</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>学习人数</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>综合评分</span>.*?<span class="nodistance">(.*?)</span>',
        re.S,
    )
    for title, difficulty, duration, stu_number, evaluation in pattern.findall(html):
        # Shape each match as a dict so it can be dumped as one JSON line
        yield {
            'title': title,
            'difficulty': difficulty,
            'duration': duration,
            'stu_number': stu_number,
            'comprehensive_evaluation': evaluation,
        }
def getClassurl(dict):
    """Fetch every listing page named in *dict* and collect its course links.

    Args:
        dict: Mapping of category name to a mapping of topic name to
            listing-page URL (the shape of ``dict_hd`` and its siblings).

    Returns:
        The module-level ``classUrl_list``, to which
        ``parse_one_classUrl`` has been applied for each page that was
        downloaded successfully.
    """
    for topics in dict.values():
        for listing_url in topics.values():
            html = geturl(listing_url)
            if html is None:
                # geturl reports a failed download with None; skip that page
                # instead of handing None to the regex parser.
                continue
            parse_one_classUrl(html)
    return classUrl_list
# Scraped links are site-relative, so they are prefixed with the site root
Classurladd = []
homeurl = 'https://coding.imooc.com'


def getaddClassurl(list):
    """Turn relative course links into absolute URLs.

    Args:
        list: Iterable of pages, each an iterable of relative link strings.

    Returns:
        The module-level ``Classurladd`` list after every link, prefixed
        with ``homeurl``, has been appended to it.
    """
    Classurladd.extend(
        homeurl + relative
        for page_links in list
        for relative in page_links
    )
    return Classurladd
# Append one record to the output text file
def write_to_file(name, content):
    r"""Append *content* as one JSON line to ``..\text\<name>.txt``.

    Args:
        name: Base name of the output file, without the ``.txt`` suffix.
        content: JSON-serialisable object; non-ASCII text is written as-is.
    """
    # Both backslashes are escaped explicitly: a bare '\%' is an invalid
    # escape sequence. The resulting path string is unchanged.
    with open('..\\text\\%s.txt' % name, 'a', encoding='utf-8') as f:
        # The with-block closes the file, so no explicit close() is needed
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# Listing-page URLs, each shaped as {category: {topic: url}}.

# Front end.
# NOTE(review): the key 'vus.js' looks like a typo for 'vue.js' — confirm before renaming.
dict_qd = {
    '前端': {
        'vus.js': 'https://coding.imooc.com/?c=vuejs',
        'HTML/CSS': 'https://coding.imooc.com/?c=html',
        'JavaScript': 'https://coding.imooc.com/?c=javascript',
        'Node.js': 'https://coding.imooc.com/?c=nodejs',
    },
}

# Back end
dict_hd = {
    '后端': {
        'java': 'https://coding.imooc.com/?c=java',
        'SpringBoot': 'https://coding.imooc.com/?c=springboot',
        'SpringCloud': 'https://coding.imooc.com/?c=springcloud',
    },
}

# Mobile development
dict_ydkf = {
    '移动开发': {
        'android': 'https://coding.imooc.com/?c=android',
        'ios': 'https://coding.imooc.com/?c=ios',
        'Reactnative': 'https://coding.imooc.com/?c=reactnative',
    },
}

# Cloud computing and big data
dict_yun = {
    '云计算大数据': {
        'hadoop': 'https://coding.imooc.com/?c=hadoop',
        '大数据': 'https://coding.imooc.com/?c=bigdata',
        'Spark': 'https://coding.imooc.com/?c=spark',
        'Docker': 'https://coding.imooc.com/?c=docker',
    },
}

# Databases
dict_db = {
    '数据库': {
        'mysql': 'https://coding.imooc.com/?c=mysql',
        'redis': 'https://coding.imooc.com/?c=redis',
        'mongodb': 'https://coding.imooc.com/?c=mongodb',
    },
}
def main():
    """Scrape every back-end course and append its record to the output file.

    The listing pages in ``dict_hd`` are fetched to discover the course
    links, each course page is downloaded and parsed, and every parsed
    record is passed to ``write_to_file`` under the name ``"dict_hd"``.
    """
    url_list = getClassurl(dict_hd)
    course_urls = getaddClassurl(url_list)
    print(classUrl_list)
    for course_url in course_urls:
        classhtml = geturl(course_url)
        if classhtml is None:
            # geturl reports a failed download with None; skip that course
            # instead of handing None to the regex parser.
            continue
        for item in parse_one_page(classhtml):
            write_to_file("dict_hd", item)


if __name__ == '__main__':
    main()
我自己写了一个爬慕课网的demo,但速度很慢,想请大神帮我改进一下。目前每换一个课程分类,我都要手动去改代码里用到的dict。
现在爬到的数据格式是:
{"title": "Spring Cloud Alibaba 大型互联网领域多场景最佳实践", "difficulty": "中级", "duration": "15小时"}
想更改为:
{"stu_name":"后端","title": "Spring Cloud Alibaba 大型互联网领域多场景最佳实践", "difficulty": "中级", "duration": "15小时"}
并且爬的速度太慢了
求求大神,路过帮孩子想想办法
你这里爬的慢,应该是没有使用多线程的缘故(你导入了多进程的库,但你这代码似乎也没有使用)。geturl是耗时操作,单线程情况下,这个操作不结束,后面代码就得等它结束,而且geturl在一轮抓取中好像还调用了不止一次。正确的思路是同时开几个子线程去getClassurl,这几个线程抓取到的结果在各自的线程经过getaddClassurl处理后统一存放到一个队列(Queue)里;之后再开多个子线程,同时向这个队列索取结果并在各自线程继续抓取内容(也就是classhtml = geturl(u)这一步也要使用多线程),最后在各自的线程里执行write_to_file。当然,爬虫爬多了,你会发现大多数情况爬那么快没什么用,因为大多数网站都有严厉的反爬措施,你还得研究怎么处理被对方断线(也就是状态码不再是200)的场景。
至于你说改dict,这不就是给字典添加一个键值对嘛,有什么难的。。。dict_hd["stu_name"]="后端"不就行了,可能是我理解错了你的意思
改进版本:大神看看,有没有还可以改的
# author:Administrator
# date:2021/04/30
import requests #第三方下载器
import re #正则表达式
import json #格式化数据用
from requests.exceptions import RequestException #做异常处理
from multiprocessing import Pool #使用多进程
def geturl(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text when the server answers with status 200;
        ``None`` for any other status or when requests raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would block the worker running this call.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None
homeurl = 'https://coding.imooc.com'
# Absolute course URLs found on the imooc listing pages, keyed by topic name
stuname_dict_url = {}


def parse_one_classUrl(html, stuname):
    """Record the course URLs found on one listing page under *stuname*.

    The ``href`` value of every ``<a target="_blank" href="...">`` tag in
    *html* is prefixed with ``homeurl`` and the resulting list is stored in
    ``stuname_dict_url[stuname]``.

    Args:
        html: Page source to scan, or ``None`` when the download failed.
        stuname: Topic name used as the key for this page's links.

    Returns:
        The module-level ``stuname_dict_url`` mapping. It is returned
        unchanged when *html* is ``None``.
    """
    if html is None:
        # A failed download has nothing to parse; re.findall would raise
        # TypeError on None.
        return stuname_dict_url
    pattern = re.compile('.*?<a target="_blank" href="(.*?)">', re.S)
    # The scraped links are site-relative, so prefix the site root
    stuname_dict_url[stuname] = [homeurl + link for link in pattern.findall(html)]
    return stuname_dict_url
# Regex extraction of the course details
def parse_one_page(html, url, stuname):
    """Yield the record of the course described by one course page.

    Args:
        html: Course page source, or ``None`` when the download failed.
        url: Address the page was fetched from; copied into the record.
        stuname: Topic name the course was listed under; copied into the
            record.

    Yields:
        At most one dict, built from the first match of the page pattern,
        with the keys ``title``, ``difficulty``, ``duration``,
        ``stu_number``, ``comprehensive_evaluation``, ``url`` and
        ``stuname``. Nothing is yielded when *html* is ``None`` or the page
        does not match.
    """
    if html is None:
        # A failed download has nothing to parse
        return
    pattern = re.compile(
        '.*?<div class="title-box">.*?<h1>(.*?)</h1>'
        '.*?<span>难度</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>时长</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>学习人数</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>综合评分</span>.*?<span class="nodistance">(.*?)</span>',
        re.S,
    )
    match = pattern.search(html)
    if match is None:
        # Page layout differs from what the pattern expects: yield nothing
        # rather than failing on an empty result.
        return
    title, difficulty, duration, stu_number, evaluation = match.groups()
    yield {
        'title': title,
        'difficulty': difficulty,
        'duration': duration,
        'stu_number': stu_number,
        'comprehensive_evaluation': evaluation,
        'url': url,
        'stuname': stuname,
    }
def getClassurl(dict):
    """Fetch every listing page named in *dict* and map topics to course URLs.

    Args:
        dict: Mapping of category name to a mapping of topic name to
            listing-page URL (the shape of ``dict_db`` and its siblings).

    Returns:
        The mapping returned by the last successful ``parse_one_classUrl``
        call, of the form ``{topic name: [course url, ...]}``. An empty
        dict is returned when *dict* names no pages or every download
        failed.
    """
    # Start from an empty result so the return value is always defined
    dic = {}
    for topics in dict.values():
        for stuname, listing_url in topics.items():
            html = geturl(listing_url)
            if html is None:
                # geturl reports a failed download with None; skip that page
                # instead of handing None to the regex parser.
                continue
            dic = parse_one_classUrl(html, stuname)
    return dic
# Append one record to the output text file
def write_to_file(name, content):
    r"""Append *content* as one JSON line to ``..\text\<name>.txt``.

    Args:
        name: Base name of the output file, without the ``.txt`` suffix.
        content: JSON-serialisable object; non-ASCII text is written as-is.
    """
    # Both backslashes are escaped explicitly: a bare '\%' is an invalid
    # escape sequence. The resulting path string is unchanged.
    with open('..\\text\\%s.txt' % name, 'a', encoding='utf-8') as f:
        # The with-block closes the file, so no explicit close() is needed
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# Listing-page URLs, each shaped as {category: {topic: url}}.

# Front end.
# NOTE(review): the key 'vus.js' looks like a typo for 'vue.js' — confirm before renaming.
dict_qd = {
    '前端': {
        'vus.js': 'https://coding.imooc.com/?c=vuejs',
        'HTML/CSS': 'https://coding.imooc.com/?c=html',
        'JavaScript': 'https://coding.imooc.com/?c=javascript',
        'Node.js': 'https://coding.imooc.com/?c=nodejs',
    },
}

# Back end
dict_hd = {
    '后端': {
        'java': 'https://coding.imooc.com/?c=java',
        'SpringBoot': 'https://coding.imooc.com/?c=springboot',
        'SpringCloud': 'https://coding.imooc.com/?c=springcloud',
    },
}

# Mobile development
dict_ydkf = {
    '移动开发': {
        'android': 'https://coding.imooc.com/?c=android',
        'ios': 'https://coding.imooc.com/?c=ios',
        'Reactnative': 'https://coding.imooc.com/?c=reactnative',
    },
}

# Cloud computing and big data
dict_yun = {
    '云计算大数据': {
        'hadoop': 'https://coding.imooc.com/?c=hadoop',
        '大数据': 'https://coding.imooc.com/?c=bigdata',
        'Spark': 'https://coding.imooc.com/?c=spark',
        'Docker': 'https://coding.imooc.com/?c=docker',
    },
}

# Databases
dict_db = {
    '数据库': {
        'mysql': 'https://coding.imooc.com/?c=mysql',
        'redis': 'https://coding.imooc.com/?c=redis',
        'mongodb': 'https://coding.imooc.com/?c=mongodb',
    },
}
def main():
    """Scrape every database course and append its record to the output file.

    The listing pages in ``dict_db`` are fetched in a worker process to
    discover the course URLs. All course pages are then submitted to the
    pool before any result is awaited, so the downloads overlap. Records
    are passed to ``write_to_file`` under the name ``"dict_db"`` in
    submission order.
    """
    pool = Pool(processes=5)
    try:
        # Course URLs per topic: {topic name: [course url, ...]}
        url_dict = pool.apply_async(getClassurl, (dict_db,)).get()
        # Queue every download first. Calling .get() right after each
        # apply_async would wait for that one job and keep the other
        # workers idle.
        pending = [
            (stuname, url, pool.apply_async(geturl, (url,)))
            for stuname, urls in url_dict.items()
            for url in urls
        ]
        for stuname, url, job in pending:
            print(stuname, url)
            classhtml = job.get()
            if classhtml is None:
                # geturl reports a failed download with None; skip that course
                continue
            for item in parse_one_page(classhtml, url, stuname):
                write_to_file("dict_db", item)
    finally:
        # Release the worker processes even when a step above raises
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()