Python 输入关键词获取图书名 和ISBN
例如输入关键词 #西游记
结果:
书名,ISBN
西游记,978**
西游记女儿国,978**
网站有吗?百度不到新华文轩网
需求太模糊,从哪里获取?具体是怎么获取?读数据库还是怎么样?
import urllib
import urllib.request
import json
def __getInfoFromDouban(isbn):
    """Fetch the Douban book record for *isbn* and return it as parsed JSON.

    Args:
        isbn: ISBN string appended to the Douban v2 book endpoint.

    Returns:
        The decoded JSON document returned by the endpoint.

    Raises:
        urllib.error.HTTPError: propagated unchanged when the server answers
            with an HTTP error status.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    # Escape the ISBN so characters such as "/" or spaces cannot alter the
    # request path.
    url = 'https://api.douban.com/v2/book/isbn/' + quote(isbn, safe='')
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as response:
        # Read the body and decode it as UTF-8 text.
        result = response.read().decode('utf8')
    # Convert the JSON text into Python objects.
    return json.loads(result)
可以在国家数字图书馆的书目检索页或者进入国家数字图书馆直接进行检索。
http://opac.nlc.cn/F/B2LYQVUJH7V1LX1Q879MVTBI6FSA3XR8QTFMLLTCE1I6DSNA8G-84640?func=file&file_name=login-session
查询9787040370683的url:
http://opac.nlc.cn/F/PFLNB317HX6AYY2KELNKA7D557QKN25GNMFS7HRPYYIRQEGLQL-11279?func=find-b&find_code=ISB&request=9787040370683&local_base=NLC01&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5=
新华文轩网是商城,搜的话结果里重复书很多,而且翻页太多了吧
同问
方式一:网页抓包法
目前很多人都是采用douban的接口,但是现在豆瓣关闭了图书信息查询的接口,不过我们还是可以采用网页爬取的技术抓取信息。爬虫的实现方式有很多种,go、php、python、java、js、.net等都是可以的
tips:可以将获取到的图书信息存到数据库中,下次请求时先查一遍数据库,如果已有记录则不用再爬取,这样可以减轻压力,久而久之你也会积累很多图书的数据。
/**
 * Look up a book on Douban by ISBN and report its details.
 *
 * Reads the ISBN from $_GET['isbn'], asks
 * https://book.douban.com/isbn/<isbn>/ where it redirects to, downloads that
 * page through getIsbn() and extracts the individual fields with cut().
 * The collected fields are passed to return_msg() with code 200; if an
 * Exception is caught it is passed to return_msg() with code 500.
 *
 * Fields that cannot be found on the page are reported as '未知'
 * ('未进行描述' for the description).
 */
public function getBookInfo()
{
    // The ISBN comes straight from the query string, so encode it before it
    // becomes part of the URL path.
    $isbn = (isset($_GET['isbn']) && is_string($_GET['isbn']))
        ? rawurlencode(trim($_GET['isbn']))
        : '';
    try {
        $surl = 'https://book.douban.com/isbn/' . $isbn . '/';

        // Use the first Location header as the address of the book page.
        // When no redirect is reported, fetch the ISBN URL itself.
        $target = $surl;
        $headers = get_headers($surl);
        if (is_array($headers)) {
            foreach ($headers as $header) {
                if (stripos($header, 'Location:') === 0) {
                    $target = trim(substr($header, strlen('Location:')));
                    break;
                }
            }
        }

        $data = $this->getIsbn($target);

        // The page metadata is read from the JSON-LD <script> element.
        $data_1 = json_decode($this->cut($data, 'application/ld+json">', '</script>'), true);
        if (!is_array($data_1)) {
            $data_1 = array();
        }

        // Shared lookup for the "<label></span>value<br/>" rows of the page.
        $info = function ($label) use ($data) {
            $value = $this->cut($data, $label, '<br/>');
            return ($value == '') ? '未知' : $value;
        };

        $res = array();
        $res['title'] = isset($data_1['name']) ? $data_1['name'] : '未知'; // title
        $res['logo'] = $this->cut($data, 'data-pic="', '"'); // cover image

        $author = (isset($data_1['author']) && is_array($data_1['author']))
            ? $data_1['author']
            : array();
        if (!isset($author[0]) || $author[0] == '') {
            $author[0] = array('name' => '未知');
        }
        $res['author'] = $author; // authors

        $res['publisher'] = $info('出版社:</span>'); // publisher

        // Author introduction: first paragraph of the first "indent" block.
        // The fallback is kept instead of being overwritten by the raw block.
        $author_block = $this->cut($data, 'class="indent ">', '</div>');
        $author_desc = $this->cut($author_block, '<p>', '</p>');
        $res['author_desc'] = ($author_desc == '') ? '未知' : $author_desc;

        $res['published'] = $info('出版年:</span>'); // publication date
        $res['page'] = $info('页数:</span>'); // page count
        $res['price'] = $info('定价:</span>'); // list price
        $res['designed'] = $info('装帧:</span>'); // binding

        $description = $this->cut($data, 'class="intro">', '</p>');
        if ($description == '') {
            $description = '未进行描述';
        } else {
            // Keep the text after the opening <p>; if there is none, keep the
            // fragment as it is rather than reading a missing array index.
            $parts = explode('<p>', $description);
            if (isset($parts[1])) {
                $description = $parts[1];
            }
        }
        $res['description'] = $description; // summary

        return_msg(200, '请求成功', $res);
    } catch (Exception $e) {
        return_msg(500, '服务器内部错误', $e);
    }
}
/**
 * Return the piece of $content found between $start and $end.
 *
 * The text considered is what follows the first occurrence of $start, up to
 * the next occurrence of $start if there is one; the result is that text
 * truncated at the first occurrence of $end (or all of it when $end does
 * not occur).
 *
 * @param string $content Text to search.
 * @param string $start   Marker that precedes the wanted text.
 * @param string $end     Marker that follows the wanted text.
 * @return string The extracted text, or '' when $start is not found or
 *                either marker is empty.
 */
private function cut($content, $start, $end)
{
    // explode() does not accept an empty delimiter, so nothing can be cut out.
    if ((string)$start === '' || (string)$end === '') {
        return '';
    }
    $pieces = explode($start, (string)$content);
    if (!isset($pieces[1])) {
        return '';
    }
    $pieces = explode($end, $pieces[1]);
    return $pieces[0];
}
/**
 * Download $url with cURL using a GET request and return the response body.
 *
 * @param string $url Address to fetch.
 * @return string|bool The response body, or false when curl_exec() fails.
 */
private function getIsbn($url)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HEADER, 0); // body only, no response headers
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    // Issue a real GET; previously this sent a POST whose body was "GET".
    curl_setopt($curl, CURLOPT_HTTPGET, 1);
    // NOTE(review): certificate and host verification are switched off, which
    // leaves the transfer open to interception. Kept as-is here; enable both
    // once a CA bundle is confirmed to be available on the server.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
    $data = curl_exec($curl);
    curl_close($curl);
    return $data;
}
————————————————
版权声明:本文为CSDN博主「唯一ll.惟一」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/qq_42836388/article/details/105255714