请勿粘贴截图 import requests
from bs4 import BeautifulSoup
import json
import pymysql
import pymongo
def A(url):
i=1
# url='https://www.glmc.edu.cn/xyzy/gyyw.htm'
pagetext=requests.get(url=url).content#检验是否成功
soup=BeautifulSoup(pagetext,'lxml')
while i in range(1,5):
flat=1
if i==1:
url=url
i=i+1
flat=2
print(url)
pagetext2=requests.get(url=url).content#检验是否成功
soup=BeautifulSoup(pagetext2,'lxml')
#divs = soup.select('.clearfix>li')
elif i>=2:
a=str(502-i)
durl2='https://www.glmc.edu.cn/xyzy/gyyw/'+a+'.htm'
i=i+1
#print(i)
print(durl2)
pagetext2=requests.get(url=durl2).content#检验是否成功
soup=BeautifulSoup(pagetext2,'lxml')
divs = soup.select('.clearfix>li')
return divs
def B(divs):
items=[]
for dd in divs:
durl='https://www.glmc.edu.cn/'+dd.find_all('a')[0].get("href")#拼接的连接
title = dd.find_all('div',{'class':"title c333 fs18 lh1 block transition250"})[0].text
# print(durl)
#print(title)
dptext=requests.get(url=durl).content
dsoup=BeautifulSoup(dptext,'lxml')
author=dsoup.select('.nlb>span')[0].text
date=dsoup.select('.nlb>span')[1].text
source=dsoup.select('.nlb>span')[2].text
# items=[]
item = {
'1':title.strip(),
'2':author.strip(),
'3': date.strip(),
'4': source.strip(),
}
items.append(item)
#print(items)
# print(author)
# print(date)
# print(source)
# author = div[0].get_text()
#print(dsoup)
# return durl,title#测试自己是否爬取成功
# return items
jsonList = json.dumps(items,indent=2, ensure_ascii=False)
# print(jsonList)
# 写入文件
with open("record.json", "w", encoding='gbk') as f:
f.write(jsonList)
def C():
db=pymysql.connect(#第一步连接到数据库
host='localhost', # 数据库服务器ip,Windows上的数据库,使用回环ip或者localhost
port=3306,#端口号,默认电脑3306
user='root',#用户
password='123456789',#登录密码
db='glmc'#数据库名
)
cursor=db.cursor()#第二步创建游标
with open("record.json", "r", encoding='gbk') as f2:
read=json.load(f2)
for each in list(read)[1:]:
i=tuple(each)
print(i)
sql='INSERT INTO news(title,author,date,source)VALUES'+str(i)
print(sql)
# sql = """create table if not exists news(title varchar(255) not null,author varchar(50) not null,date varchar(50) not null,source varchar(50) not null) """
cursor.execute(sql)
db.commit()
cursor.close()
db.close()
#
def D():
m=pymongo.MongoClient(
host='localhost', # 数据库服务器ip,Windows上的数据库,使用回环ip或者localhost
port=3306,#端口号,默认电脑3306
user='root',#用户
password='123456789',#登录密码
db='glmc'#数据库名
)
with open("record.json", "r", encoding='gbk') as f2:
read=json.load(f2)
mydb=m.test
collection=mydb.glmc
for each in list(read)[1:]:
i=each
collection.insert_many(list(i))
if __name__=='__main__': #文件作为脚本直接执行
url='https://www.glmc.edu.cn/xyzy/gyyw.htm'
html = A(url)
B(html)# Klist =
# C(Klist)
C()
D()
修改如图:
def C():
db = pymysql.connect( # 第一步连接到数据库
host='localhost', # 数据库服务器ip,Windows上的数据库,使用回环ip或者localhost
port=3306, # 端口号,默认电脑3306
user='root', # 用户
password='root', # 登录密码
db='glmc' # 数据库名
)
cursor = db.cursor() # 第二步创建游标
with open("record.json", "r", encoding='gbk') as f2:
read = json.load(f2)
for each in list(read)[1:]:
i = list(each)
print(i)
data = ['"'+each.get(key)+'"' for key in i]
sql = 'INSERT INTO news(title,author,date,source)VALUES(' + ','.join(data) + ')'
# sql = """create table if not exists news(title varchar(255) not null,author varchar(50) not null,date varchar(50) not null,source varchar(50) not null) """
cursor.execute(sql)
db.commit()
cursor.close()
db.close()
def D():
m = pymongo.MongoClient(
host='localhost', # 数据库服务器ip,Windows上的数据库,使用回环ip或者localhost
port=27017 # 端口号,默认电脑3306
)
db = m['glmc']
collection = db['news']
with open("record.json", "r", encoding='gbk') as f2:
read = json.load(f2)
collection.insert_many(read)
提供2种方法和实例
导入的两种方法:MongoDB Compass界面操作、Mongo Shell 命令行操作
1、Mongo Shell 命令行操作,实例链接:https://blog.csdn.net/weixin_45730243/article/details/126322697
2、MongoDB Compass界面操作,实例链接:https://blog.csdn.net/WiNdLYen/article/details/88355892?spm=1001.2101.3001.6650.6&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ERate-6-88355892-blog-126322697.pc_relevant_aa2&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ERate-6-88355892-blog-126322697.pc_relevant_aa2&utm_relevant_index=12
db=pymysql.connect(#第一步连接到数据库
host='localhost', # 数据库服务器ip,Windows上的数据库,使用回环ip或者localhost
port=3306,#端口号,默认电脑3306
user='root',#用户
password='123456789',#登录密码
db='glmc'#数据库名
)
首先,你确认这个爬虫能不能跑,然后,上面这一段是MySQL的连接配置,你根据自己的实际修改即可。
比如,你的MySQL数据库root密码是123456,那就填写123456,你想存到哪个库哪个表里,提前建立。
运行结果和报错内容没有贴出来吖
建议直接数据库创建text类型的字符进行存储就行了针对mysql,不建议存储文件,如果需要的话mongodb存储二进制流;mysql需要定义为blob类型。
mongodb数据库导入json文件
如有帮助,望采纳
https://blog.csdn.net/weixin_45730243/article/details/126322697